import numpy as np
from transformers import pipeline

print("Loading models...")

# Two wav2vec2-based speech-emotion-recognition pipelines, ensembled later
# by fuse_results(). Both are loaded eagerly at import time, so importing
# this module downloads/initializes both models (network + disk cache).
# NOTE(review): the two models use different label vocabularies — e.g. full
# words vs. abbreviated tags — so their scores may not overlap; confirm the
# fusion step behaves as intended for disjoint label sets.
model_a = pipeline(
    "audio-classification",
    model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
)

model_b = pipeline(
    "audio-classification",
    model="superb/wav2vec2-base-superb-er"
)

print("Models ready")


def fuse_results(results_a, results_b):
    """Fuse two models' classification outputs by averaging per-label scores.

    Args:
        results_a: list of ``{"label": str, "score": float}`` dicts from
            the first model.
        results_b: same shape, from the second model.

    Returns:
        List of ``{"emotion": str, "confidence": float}`` dicts sorted by
        descending confidence. Each confidence is the summed score for that
        label divided by 2 (the number of models), so a label predicted by
        only one model is effectively penalized.
    """
    combined = {}
    # Accumulate scores from both result lists uniformly. The original code
    # overwrote duplicate labels in results_a (last-wins) but summed them in
    # results_b — an inconsistency; summing everywhere fixes that while
    # leaving the normal case (unique labels per model) unchanged.
    for item in (*results_a, *results_b):
        label = item["label"]
        combined[label] = combined.get(label, 0.0) + item["score"]

    final = [
        {"emotion": emotion, "confidence": float(score / 2)}
        for emotion, score in combined.items()
    ]
    return sorted(final, key=lambda x: x["confidence"], reverse=True)


def predict_emotion(audio):
    """Classify *audio* with both emotion models and return the fused,
    confidence-sorted result list (see ``fuse_results``)."""
    return fuse_results(model_a(audio), model_b(audio))