import numpy as np
from transformers import pipeline

# Both pipelines are constructed eagerly at import time, so importing this
# module blocks until the two checkpoints below have been instantiated.
print("Loading models...")

# First audio-classification pipeline; called by predict_emotion().
model_a = pipeline(
    task="audio-classification",
    model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
)

# Second audio-classification pipeline; its output is merged with model_a's
# by fuse_results().
model_b = pipeline(
    task="audio-classification",
    model="superb/wav2vec2-base-superb-er",
)

print("Models ready")


# Maps checkpoint-specific label spellings onto one shared vocabulary so that
# the same emotion reported by both models lands in the same bucket.
# NOTE(review): the abbreviated forms are what the SUPERB emotion-recognition
# checkpoint is documented to emit on its model card -- confirm against
# ``model.config.id2label`` of the checkpoints actually loaded.
_LABEL_ALIASES = {
    "neu": "neutral",
    "hap": "happy",
    "ang": "angry",
}


def fuse_results(results_a, results_b, label_aliases=None):
    """Merge two classifier outputs into one ranking by averaging scores.

    Each input is an iterable of dicts carrying a ``"label"`` and a
    ``"score"`` key. Labels are first normalised through ``label_aliases``;
    scores that end up under the same label are summed and the sum is divided
    by two (the number of models). A label reported by only one model is
    therefore treated as if the other model had scored it 0.

    Args:
        results_a: Predictions of the first model.
        results_b: Predictions of the second model.
        label_aliases: Optional mapping from a raw label to its canonical
            name. Labels missing from the mapping are kept unchanged.
            Defaults to ``_LABEL_ALIASES``; pass ``{}`` to merge on exact
            label equality only.

    Returns:
        A list of ``{"emotion": str, "confidence": float}`` dicts sorted by
        descending confidence. Ties keep first-seen order (``results_a``
        before ``results_b``). Empty inputs yield an empty list.
    """
    aliases = _LABEL_ALIASES if label_aliases is None else label_aliases

    totals = {}
    # Both inputs are accumulated the same way so a repeated label is summed
    # regardless of which list it appears in.
    for results in (results_a, results_b):
        for item in results:
            raw_label = item["label"]
            label = aliases.get(raw_label, raw_label)
            if label in totals:
                totals[label] += item["score"]
            else:
                totals[label] = item["score"]

    # float() keeps the output a plain Python float even if a score arrived
    # as a NumPy scalar.
    fused = [
        {"emotion": label, "confidence": float(total / 2)}
        for label, total in totals.items()
    ]
    fused.sort(key=lambda entry: entry["confidence"], reverse=True)
    return fused


def predict_emotion(audio):
    """Classify *audio* with both loaded pipelines and return the fused result.

    The same ``audio`` object is handed unchanged to ``model_a`` and then to
    ``model_b``; the two result lists are merged by ``fuse_results``.

    Args:
        audio: Input forwarded as-is to each pipeline.
            NOTE(review): the accepted formats are whatever the pipelines
            accept -- presumably a file path or a waveform; confirm.

    Returns:
        The value returned by ``fuse_results`` for the two prediction lists.
    """
    # Call arguments are evaluated left to right, so model_a still runs
    # before model_b.
    return fuse_results(model_a(audio), model_b(audio))