In [1]:
from classes import *
import torch.optim as optim
from torch.utils.data import DataLoader, ConcatDataset
import torch.nn.functional as F

In [2]:
dataset = FaceVoiceDataset("train", 16000, 0)
augmented_dataset = FaceVoiceDataset("train", 16000, 2)

trainset = ConcatDataset([dataset, augmented_dataset])
valiset = FaceVoiceDataset("dev", 16000, 0)

image, mel_spec, label = dataset[0] 
print(image.shape)      # torch.Size([3, 80, 80])
print(mel_spec.shape)   # torch.Size([1, 64, T])
print(len(trainset), len(valiset))

torch.Size([3, 80, 80])
torch.Size([1, 64, 241])
4464 124


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Zařízení:", device)

Zařízení: cuda


In [4]:
def train_one_epoch(model, dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for face, voice, labels in dataloader:
        face, voice, labels = face.to(device), voice.to(device), labels.to(device)

        outputs = model(face, voice)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * face.size(0)
        _, preds = torch.max(outputs, 1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    avg_loss = total_loss / total
    accuracy = correct / total
    return avg_loss, accuracy


def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for face, voice, labels in dataloader:
            face, voice, labels = face.to(device), voice.to(device), labels.to(device)

            outputs = model(face, voice)
            loss = criterion(outputs, labels)

            total_loss += loss.item() * face.size(0)
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    avg_loss = total_loss / total
    accuracy = correct / total
    return avg_loss, accuracy

In [5]:
def train_model(model, train_loader, val_loader, epochs=10, lr=1e-4):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    best_val_acc = 0.0
    best_model_state = None

    print("Training starts...")
    for epoch in range(epochs):
        train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer)
        val_loss, val_acc = evaluate(model, val_loader, criterion)

        print(f"Epoch {epoch+1}/{epochs}")
        print(f"  Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f}")
        print(f"  Val   Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model_state = model.state_dict()

    # Na konci ulož nejlepší model
    if best_model_state:
        torch.save(best_model_state, "best_model.pth")
        print(f"Nejlepší model uložen s val acc {best_val_acc:.4f}")

In [6]:
train_loader = DataLoader(trainset, batch_size=32, shuffle=True)
val_loader = DataLoader(valiset, batch_size=32)

model = FaceVoiceClassifier(num_classes=31)

train_model(model, train_loader, val_loader, epochs=5, lr=1e-4)

Dims: 12800 15360 28160
Training starts...
Epoch 1/5
  Train Loss: 1.8800, Acc: 0.5137
  Val   Loss: 2.4913, Acc: 0.2903
Epoch 2/5
  Train Loss: 0.4304, Acc: 0.9272
  Val   Loss: 2.3519, Acc: 0.4597
Epoch 3/5
  Train Loss: 0.1840, Acc: 0.9720
  Val   Loss: 2.2609, Acc: 0.5323
Epoch 4/5
  Train Loss: 0.1118, Acc: 0.9841
  Val   Loss: 2.2795, Acc: 0.5403
Epoch 5/5
  Train Loss: 0.0657, Acc: 0.9933
  Val   Loss: 2.1442, Acc: 0.5323
Nejlepší model uložen s val acc 0.5403


In [None]:
classifier = FaceVoiceClassifier(num_classes=31)
#classifier.load_state_dict(torch.load("8548-10-20_best_model.pth", map_location=device))
classifier.load_state_dict(torch.load("best_model.pth", map_location=device))
classifier.to(device)
classifier.eval()

with torch.no_grad():
    evalset = FaceVoiceEvalset("eval", 16000)
    eval_loader = DataLoader(evalset, batch_size=1, shuffle=False)
    
    for face, voice, filename in eval_loader:
        face, voice = face.to(device), voice.to(device)
        outputs = classifier(face, voice)
        _, preds = torch.max(outputs, 1)
        ll = F.log_softmax(outputs, dim=1)
        print(f"{filename[0]} {preds.item() + 1}", ' '.join(f'{v:.2f}' for v in ll.flatten().tolist()))

Dims: 12800 15360 28160
eval_00001 10 -10.37 -13.80 -14.47 -11.51 -10.17 -6.24 -9.21 -3.71 -4.67 -0.04 -12.17 -11.96 -18.61 -10.10 -14.98 -27.24 -11.72 -6.94 -15.96 -14.81 -17.90 -7.84 -20.79 -11.37 -10.43 -14.25 -8.46 -11.84 -14.95 -21.12 -25.84
eval_00002 21 -21.63 -21.91 -19.74 -10.62 -13.36 -16.73 -19.36 -24.78 -18.23 -19.26 -20.33 -19.68 -20.32 -16.76 -20.61 -8.21 -17.32 -25.25 -15.98 -14.53 -0.01 -16.57 -5.51 -5.28 -12.68 -24.91 -16.22 -12.57 -14.66 -15.30 -20.18
eval_00003 8 -24.39 -23.64 -31.94 -27.51 -21.65 -18.73 -18.18 -0.00 -14.62 -12.42 -39.55 -16.81 -29.10 -26.53 -34.66 -46.30 -27.31 -15.16 -29.51 -25.46 -32.04 -20.92 -43.13 -23.08 -28.75 -27.68 -23.28 -23.73 -36.33 -40.16 -42.90
eval_00004 11 -32.73 -44.78 -22.14 -24.48 -27.28 -25.47 -33.28 -23.91 -29.62 -28.62 0.00 -45.58 -35.75 -25.10 -27.14 -40.54 -36.49 -34.53 -25.18 -47.45 -37.55 -31.76 -39.63 -49.09 -32.81 -41.04 -31.86 -43.25 -49.76 -47.39 -41.51
eval_00005 3 -10.24 -16.45 -0.00 -5.76 -13.74 -13.64 -14.77 -18.86 -