File size: 2,402 Bytes
a0e0ff1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from model import CustomBERTModel
from config import Config
import pandas as pd
from tqdm import tqdm

def load_data(file_path):
    """Read a header-less CSV file into a float32 tensor.

    Args:
        file_path: Location of the CSV file; every row is treated as data
            (no header row is expected).

    Returns:
        A ``torch.float32`` tensor holding the values of the parsed table.
    """
    frame = pd.read_csv(file_path, header=None)
    raw_values = frame.values
    return torch.tensor(raw_values, dtype=torch.float32)

def create_mlm_data(data, mlm_probability):
    """Build masked-language-modelling inputs and labels from a batch.

    Every position is selected for prediction independently with probability
    ``mlm_probability``. Of the selected positions roughly 80% are overwritten
    with 0 (the value this file uses to stand in for [MASK]), roughly 10% are
    overwritten with a random id drawn from ``[0, Config.vocab_size)``, and
    the remainder keep their original value.

    The caller's tensor is not modified: the corrupted inputs are written to
    a copy. All sampling tensors are created on ``data.device`` so no
    cross-device indexing or assignment takes place.

    Args:
        data: Tensor of token values, of any shape.
        mlm_probability: Per-position probability of being selected.

    Returns:
        A ``(masked_inputs, labels)`` tuple of tensors shaped like ``data``.
        ``labels`` holds the original value at selected positions and -100
        everywhere else.
    """
    device = data.device
    shape = data.shape

    # Work on copies so the tensor owned by the caller stays intact.
    masked_inputs = data.clone()
    labels = data.clone()

    # Explicit float dtype: an integer probability (0 or 1) would otherwise
    # make torch.full infer an integer tensor, which bernoulli rejects.
    probability_matrix = torch.full(
        shape, mlm_probability, dtype=torch.float32, device=device
    )
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with [MASK]
    replace_draw = torch.bernoulli(
        torch.full(shape, 0.8, dtype=torch.float32, device=device)
    ).bool()
    indices_replaced = replace_draw & masked_indices
    masked_inputs[indices_replaced] = 0  # Assume 0 is the representation of [MASK]

    # 10% of the time, we replace masked input tokens with a random word:
    # half of the 20% of selected positions that were not replaced above.
    random_draw = torch.bernoulli(
        torch.full(shape, 0.5, dtype=torch.float32, device=device)
    ).bool()
    indices_random = random_draw & masked_indices & ~indices_replaced
    random_words = torch.randint(
        Config.vocab_size, shape, dtype=torch.long, device=device
    )
    # Cast to the input dtype so the indexed assignment has matching dtypes.
    masked_inputs[indices_random] = random_words[indices_random].to(masked_inputs.dtype)

    # The remaining selected positions keep their original value.
    return masked_inputs, labels

def test():
    """Evaluate the saved MLM checkpoint on the test set and print metrics.

    Loads the weights from ``bert_mlm_model.pth`` into a ``CustomBERTModel``,
    reads the test CSV named by ``config.test_file``, masks every batch with
    ``create_mlm_data`` and accumulates the loss and the accuracy over the
    masked positions. The mean per-batch loss and the accuracy are printed.

    If the test set yields no batches, or no position ends up masked, the
    corresponding metric is reported as ``nan`` instead of raising
    ``ZeroDivisionError``.
    """
    config = Config()
    model = CustomBERTModel(config).to(config.device)
    # map_location lets a checkpoint saved on a different device (for example
    # a GPU) be loaded when only config.device is available.
    state_dict = torch.load("bert_mlm_model.pth", map_location=config.device)
    model.load_state_dict(state_dict)
    model.eval()

    test_data = load_data(config.test_file)
    test_dataset = TensorDataset(test_data)
    test_loader = DataLoader(test_dataset, batch_size=config.batch_size)

    total_loss = 0
    total_correct = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            # TensorDataset wraps each sample in a 1-tuple; take the tensor.
            inputs = batch[0].to(config.device)
            masked_inputs, labels = create_mlm_data(inputs, config.mlm_probability)

            outputs = model(masked_inputs, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            predictions = outputs.logits.argmax(dim=-1)
            # Only positions selected for masking carry a real label.
            mask = labels != -100
            total_correct += (predictions[mask] == labels[mask]).sum().item()
            total_predictions += mask.sum().item()

    # Guard both divisions: an empty test set gives zero batches, and a small
    # set with a low masking probability can give zero masked positions.
    num_batches = len(test_loader)
    avg_loss = total_loss / num_batches if num_batches else float("nan")
    accuracy = total_correct / total_predictions if total_predictions else float("nan")

    print(f"Test Loss: {avg_loss:.4f}")
    print(f"Test Accuracy: {accuracy:.4f}")

# Run the evaluation only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    test()