AlexWortega committed on
Commit
cde5368
·
verified ·
1 Parent(s): 05eef8c

Upload processing_borealis.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. processing_borealis.py +50 -2
processing_borealis.py CHANGED
@@ -4,6 +4,7 @@ Borealis Processor for HuggingFace/vLLM compatibility.
4
  Handles audio feature extraction and tokenization.
5
  """
6
 
 
7
  from typing import List, Optional, Union
8
 
9
  import torch
@@ -28,6 +29,9 @@ class BorealisProcessor(ProcessorMixin):
28
  audio_bos_token = "<|start_of_audio|>"
29
  audio_eos_token = "<|start_of_audio|>" # Reuse bos token since only 2 audio tokens in vocab
30
 
 
 
 
31
  def __init__(
32
  self,
33
  feature_extractor: Optional[WhisperFeatureExtractor] = None,
@@ -59,8 +63,10 @@ class BorealisProcessor(ProcessorMixin):
59
  """
60
  Process text and/or audio inputs.
61
 
 
 
62
  Args:
63
- text: Text prompt(s)
64
  audio: Audio waveform(s) at 16kHz
65
  audios: Audio waveform(s) at 16kHz (vLLM style)
66
  sampling_rate: Audio sampling rate (default: 16000)
@@ -88,20 +94,62 @@ class BorealisProcessor(ProcessorMixin):
88
  for a in audio:
89
  if isinstance(a, torch.Tensor):
90
  a = a.numpy()
 
 
91
  audio_arrays.append(a)
92
 
93
  audio_features = self.feature_extractor(
94
  audio_arrays,
95
  sampling_rate=sampling_rate,
96
  return_tensors=return_tensors,
 
 
97
  )
98
  data["input_features"] = audio_features.input_features
99
 
100
- # Process text if provided
 
 
 
 
 
 
 
 
 
 
 
101
  if text is not None:
102
  if isinstance(text, str):
103
  text = [text]
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  # Filter out kwargs that tokenizer doesn't accept
106
  tok_kwargs = {k: v for k, v in kwargs.items()
107
  if k in ['padding', 'truncation', 'max_length', 'add_special_tokens']}
 
4
  Handles audio feature extraction and tokenization.
5
  """
6
 
7
+ import numpy as np
8
  from typing import List, Optional, Union
9
 
10
  import torch
 
29
  audio_bos_token = "<|start_of_audio|>"
30
  audio_eos_token = "<|start_of_audio|>" # Reuse bos token since only 2 audio tokens in vocab
31
 
32
+ # Borealis architecture parameters
33
+ downsample_factor = 4 # Audio embedding downsampling factor
34
+
35
  def __init__(
36
  self,
37
  feature_extractor: Optional[WhisperFeatureExtractor] = None,
 
63
  """
64
  Process text and/or audio inputs.
65
 
66
+ Expands <|AUDIO|> tokens in text to match the number of audio embeddings.
67
+
68
  Args:
69
+ text: Text prompt(s) containing <|AUDIO|> placeholders
70
  audio: Audio waveform(s) at 16kHz
71
  audios: Audio waveform(s) at 16kHz (vLLM style)
72
  sampling_rate: Audio sampling rate (default: 16000)
 
94
  for a in audio:
95
  if isinstance(a, torch.Tensor):
96
  a = a.numpy()
97
+ if isinstance(a, np.ndarray):
98
+ a = a.astype(np.float32)
99
  audio_arrays.append(a)
100
 
101
  audio_features = self.feature_extractor(
102
  audio_arrays,
103
  sampling_rate=sampling_rate,
104
  return_tensors=return_tensors,
105
+ padding="max_length",
106
+ return_attention_mask=True,
107
  )
108
  data["input_features"] = audio_features.input_features
109
 
110
+ # Calculate audio lengths for token expansion
111
+ # Whisper uses 30s chunks with 3000 mel frames -> 1500 encoder frames
112
+ # Borealis downsamples by 4x -> 375 tokens
113
+ attention_mask = audio_features.get("attention_mask")
114
+ if attention_mask is not None:
115
+ # Sum attention mask to get actual audio length in frames
116
+ audio_lengths = attention_mask.sum(dim=-1).tolist()
117
+ else:
118
+ # Default: assume full 30s audio
119
+ audio_lengths = [3000] * len(audio_arrays)
120
+
121
+ # Process text if provided - expand audio tokens
122
  if text is not None:
123
  if isinstance(text, str):
124
  text = [text]
125
 
126
+ # Expand <|AUDIO|> tokens based on audio lengths
127
+ if audio is not None:
128
+ expanded_text = []
129
+ audio_idx = 0
130
+
131
+ for sample in text:
132
+ while self.audio_token in sample:
133
+ if audio_idx < len(audio_lengths):
134
+ audio_len = audio_lengths[audio_idx]
135
+ # Whisper: 3000 mel frames -> 1500 encoder frames
136
+ # Then downsample by 4 -> 375 tokens
137
+ whisper_frames = (audio_len - 1) // 2 + 1 # ~1500
138
+ num_audio_tokens = whisper_frames // self.downsample_factor # ~375
139
+
140
+ # Expand single <|AUDIO|> to multiple tokens with markers
141
+ expanded = (
142
+ self.audio_bos_token +
143
+ self.audio_token * num_audio_tokens +
144
+ self.audio_eos_token
145
+ )
146
+ sample = sample.replace(self.audio_token, expanded, 1)
147
+ audio_idx += 1
148
+ else:
149
+ break
150
+ expanded_text.append(sample)
151
+ text = expanded_text
152
+
153
  # Filter out kwargs that tokenizer doesn't accept
154
  tok_kwargs = {k: v for k, v in kwargs.items()
155
  if k in ['padding', 'truncation', 'max_length', 'add_special_tokens']}