| # Model Loading and Testing Instructions |
|
|
| This document provides step-by-step instructions on how to load our model from the Hugging Face Hub and evaluate it on a test dataset. |
| The following code loads and tests the model in a Colab notebook. |
|
|
| --- |
|
|
| # Step 1: Prerequisites |
|
|
| 1. Import the required Python packages: |
|
|
| ```python |
| from huggingface_hub import login |
| import torch |
| import torch.nn as nn |
| from transformers import RobertaForSequenceClassification, RobertaTokenizer |
| from torch.utils.data import Dataset, DataLoader |
| import pandas as pd |
| import numpy as np |
| import re |
| from sklearn.metrics import accuracy_score |
| from transformers import AutoModel, AutoTokenizer |
| from huggingface_hub import login |
| ``` |
| 2. Log in using the account key (see our private Ed post and the email sent to the TAs, thanks!): |
|
|
| ```python |
# Authenticate with the Hugging Face Hub; paste the shared key in place of
# the placeholder string before running this cell.
login(token="Replace with the key")
| ``` |
|
|
| # Step 2: Define the preprocessing and dataset class |
|
|
| Run the following class and function definitions, which are used to preprocess the test data: |
|
|
| ```python |
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of integer class ids, aligned with
            ``texts`` by position.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")``.
            It must return a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx``.

        Returns:
            A dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``
            (a ``torch.long`` scalar tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            # With return_tensors="pt" the encoding is expected to carry a
            # leading batch axis of size 1. squeeze(0) removes only that axis;
            # a bare squeeze() would also drop the sequence axis whenever
            # max_len == 1 and break batching in the DataLoader.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }
| |
def preprocess_text(text) -> str:
    """Clean and normalise a headline before tokenization.

    Steps, in order: expand a few English contractions, collapse dollar
    amounts to ``"$ <magnitude>"``, strip URLs, turn hyphens into spaces,
    lower-case, and collapse runs of whitespace.

    Args:
        text: Value to clean; it is converted with ``str()`` first, so
            non-string inputs are accepted.

    Returns:
        The cleaned, lower-cased, single-spaced string.
    """
    text = str(text)
    contractions = {
        "n't": " not",
        "'s": " is",
        "'ll": " will",
        "'ve": " have",
    }
    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)

    # In a raw string a doubled backslash matches a literal "\" character, so
    # the previous `\\d` / `\\S` patterns never matched real digits or URLs.
    # NOTE(review): if the model was fine-tuned with the double-escaped
    # patterns, confirm that evaluation should use the corrected ones.
    text = re.sub(
        r'\$\d+\.?\d*\s*(million|billion|trillion)?',
        r'$ \1',
        text,
        flags=re.IGNORECASE,
    )
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'-', ' ', text)
    text = text.lower()
    # split()/join collapses every whitespace run and trims both ends.
    text = ' '.join(text.split())
    return text
| ``` |
|
|
|
|
| # Step 3: Load the model and tokenizer from Hugging Face Hub |
| This step loads the pre-trained model and tokenizer, which are hosted on the Hugging Face Hub. |
|
|
| ```python |
print("Loading model and tokenizer...")

# Hub repository the model was pushed to.
REPO_NAME = "CIS5190GoGo/CustomModel"
model = RobertaForSequenceClassification.from_pretrained(REPO_NAME)
tokenizer = RobertaTokenizer.from_pretrained(REPO_NAME)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)
print("Model and tokenizer loaded successfully!")
| ``` |
|
|
| # Step 4: Load test dataset |
| ```python |
print("Loading test data...")
test_data_path = "Replace with your test set path"  # Note: replace with your test set path
test_data = pd.read_csv(test_data_path)

# The preprocessing step reads the 'title' and 'labels' columns; fail here
# with a clear message instead of a bare KeyError further down.
missing_columns = {"title", "labels"} - set(test_data.columns)
if missing_columns:
    raise KeyError(
        f"Test set is missing required column(s): {sorted(missing_columns)}"
    )
| ``` |
| # Step 5: Preprocess test data |
| ```python |
# Clean every title with preprocess_text and pull both columns out as arrays.
title_column = test_data["title"]
label_column = test_data["labels"]
X_test = title_column.apply(preprocess_text).values
y_test = label_column.values
| ``` |
|
|
| # Step 6: Prepare the dataset and dataloader |
| ```python |
# Wrap the cleaned texts and labels, then batch them for evaluation.
test_dataset = NewsDataset(texts=X_test, labels=y_test, tokenizer=tokenizer)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=16,
    num_workers=2,
)
| ``` |
|
|
| # Step 7: Evaluate the model and calculate accuracy |
| ```python |
print("Evaluating the model...")
model.eval()
all_preds, all_labels = [], []

# Gradients are not needed for evaluation.
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # The predicted class is the index of the largest logit.
        preds = torch.argmax(outputs.logits, dim=-1)

        all_preds.extend(preds.cpu().tolist())
        # Labels are only compared on the host, so they never need to be
        # moved to the device.
        all_labels.extend(batch["labels"].tolist())

accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy:.4f}")
| ``` |
| # Expected output: |
| ```text |
| Loading model and tokenizer... |
| Model and tokenizer loaded successfully! |
| Loading test data... |
| Evaluating the model... |
| Test Accuracy: 0.8500 |
| ``` |