camenduru / emage
EMAGE: Towards Unified Holistic Co-Speech Gesture Generation via Expressive Masked Audio Gesture Modeling
Prediction
camenduru/emage:80048634fe2d4edf63687229c69d7b868204a569e91df880ba418eb8a7485e73
ID: bbtdxcbg3hrgg0cer7es72m7e4
Status: Succeeded
Source: Web
Hardware: A40
Total duration
Created
Input
- audio_path
- Video Player is loading. Current Time 00:00:000 / Duration 00:00:000 Loaded: 0% Stream Type LIVE Remaining Time -00:00:000 1x
- Chapters
- descriptions off, selected
- captions settings, opens captions settings dialog
- captions off, selected
This is a modal window.
Beginning of dialog window. Escape will cancel and close the window.
End of dialog window.
{ "audio_path": "https://replicate.delivery/pbxt/KiSDBoOlK2UD4byZ5eQsx8UxvcS1HEwS5T5jiMfzyRZ5UiVv/tmp.wav" }
Install Replicate’s Node.js client library:
npm install replicate
Import and set up the client:
import Replicate from "replicate";

const replicate = new Replicate({
  auth: process.env.REPLICATE_API_TOKEN,
});
Run camenduru/emage using Replicate’s API. Check out the model's schema for an overview of inputs and outputs.
const output = await replicate.run(
  "camenduru/emage:80048634fe2d4edf63687229c69d7b868204a569e91df880ba418eb8a7485e73",
  {
    input: {
      audio_path: "https://replicate.delivery/pbxt/KiSDBoOlK2UD4byZ5eQsx8UxvcS1HEwS5T5jiMfzyRZ5UiVv/tmp.wav"
    }
  }
);

// To access the file URL:
console.log(output[0].url()); //=> "http://example.com"

// To write the file to disk:
fs.writeFile("my-image.png", output[0]);
To learn more, take a look at the guide on getting started with Node.js.
Install Replicate’s Python client library:
pip install replicate
Import the client:
import replicate
Run camenduru/emage using Replicate’s API. Check out the model's schema for an overview of inputs and outputs.
output = replicate.run(
    "camenduru/emage:80048634fe2d4edf63687229c69d7b868204a569e91df880ba418eb8a7485e73",
    input={
        "audio_path": "https://replicate.delivery/pbxt/KiSDBoOlK2UD4byZ5eQsx8UxvcS1HEwS5T5jiMfzyRZ5UiVv/tmp.wav"
    }
)
print(output)
To learn more, take a look at the guide on getting started with Python.
Run camenduru/emage using Replicate’s API. Check out the model's schema for an overview of inputs and outputs.
curl -s -X POST \
  -H "Authorization: Bearer $REPLICATE_API_TOKEN" \
  -H "Content-Type: application/json" \
  -H "Prefer: wait" \
  -d $'{
    "version": "camenduru/emage:80048634fe2d4edf63687229c69d7b868204a569e91df880ba418eb8a7485e73",
    "input": {
      "audio_path": "https://replicate.delivery/pbxt/KiSDBoOlK2UD4byZ5eQsx8UxvcS1HEwS5T5jiMfzyRZ5UiVv/tmp.wav"
    }
  }' \
  https://api.replicate.com/v1/predictions
To learn more, take a look at Replicate’s HTTP API reference docs.
Output
{ "completed_at": "2024-04-09T08:01:02.230486Z", "created_at": "2024-04-09T07:58:18.652000Z", "data_removed": false, "error": null, "id": "bbtdxcbg3hrgg0cer7es72m7e4", "input": { "audio_path": "https://replicate.delivery/pbxt/KiSDBoOlK2UD4byZ5eQsx8UxvcS1HEwS5T5jiMfzyRZ5UiVv/tmp.wav" }, "logs": "2024-04-09 08:00:42.977 | INFO | utils.other_tools_hf:print_exp_info:877 - {'a_encoder': None,\n'a_fix_pre': False,\n'a_pre_encoder': None,\n'acc_weight': 0.0,\n'additional_data': False,\n'adv_weight': 20.0,\n'ali_weight': 0.0,\n'amsgrad': False,\n'apex': False,\n'asmr': 0.0,\n'atcont': 0.0,\n'atmr': 0.0,\n'aud_prob': 1.0,\n'audio_dims': 1,\n'audio_f': 256,\n'audio_fps': 16000,\n'audio_norm': False,\n'audio_rep': 'wave16k',\n'audio_sr': 16000,\n'batch_size': 64,\n'beat_align': True,\n'benchmark': True,\n'cache_only': False,\n'cache_path': './datasets/beat_cache/beat_smplx_en_emage_test/',\n'cf': 0.0,\n'ch': 1.0,\n'cl': 1.0,\n'clean_final_seconds': 0,\n'clean_first_seconds': 0,\n'config': './configs/emage_test_hf.yaml',\n'csv_name': 'a2g_0',\n'cu': 1.0,\n'cudnn_enabled': True,\n'd_lr_weight': 0.2,\n'd_name': None,\n'data_path': './EMAGE/test_sequences/',\n'data_path_1': './EMAGE/',\n'dataset': 'beat_testonly_hf',\n'ddp': False,\n'debug': False,\n'decay_epochs': 9999,\n'decay_rate': 0.1,\n'decode_fusion': None,\n'deterministic': True,\n'disable_filtering': False,\n'div_reg_weight': 0.0,\n'dropout_prob': 0.3,\n'e_name': 'VAESKConv',\n'e_path': 'weights/AESKConv_240_100.bin',\n'emo_rep': None,\n'emotion_dims': 8,\n'emotion_f': 0,\n'epoch_stage': 0,\n'epochs': 400,\n'eval_model': 'motion_representation',\n'f_encoder': 'null',\n'f_fix_pre': False,\n'f_pre_encoder': 'null',\n'fac_prob': 1.0,\n'facial_dims': 100,\n'facial_f': 0,\n'facial_fps': 15,\n'facial_norm': False,\n'facial_rep': 'smplxflame_30',\n'fid_weight': 0.0,\n'finger_net': 'original',\n'freeze_wordembed': True,\n'fsmr': 0.0,\n'ftmr': 0.0,\n'fusion_mode': 'sum',\n'g_name': 'MAGE_Transformer',\n'gap_weight': 0.0,\n'gpus': 
[0],\n'grad_norm': 0.99,\n'hidden_size': 768,\n'id_rep': 'onehot',\n'input_context': 'both',\n'is_train': True,\n'ita_weight': 0.0,\n'iwa_weight': 0.0,\n'kld_aud_weight': 0.0,\n'kld_fac_weight': 0.0,\n'kld_weight': 0.0,\n'l': 4,\n'lf': 3.0,\n'lh': 3.0,\n'll': 3.0,\n'loader_workers': 0,\n'log_period': 10,\n'loss_contrastive_neg_weight': 0.005,\n'loss_contrastive_pos_weight': 0.2,\n'loss_gan_weight': 5.0,\n'loss_kld_weight': 0.1,\n'loss_physical_weight': 0.0,\n'loss_reg_weight': 0.05,\n'loss_regression_weight': 70.0,\n'lr_base': 0.0005,\n'lr_min': 1e-07,\n'lr_policy': 'step',\n'lu': 3.0,\n'm_decoder': None,\n'm_encoder': 'null',\n'm_fix_pre': False,\n'm_pre_encoder': 'null',\n'mean_pose_path': '/datasets/trinity/train/',\n'model': 'emage_audio',\n'momentum': 0.8,\n'motion_f': 256,\n'msmr': 0.0,\n'mtmr': 0.0,\n'multi_length_training': [1.0],\n'n_layer': 1,\n'n_poses': 34,\n'n_pre_poses': 4,\n'name': '0409_080042_emage_test_hf',\n'nesterov': True,\n'new_cache': True,\n'no_adv_epoch': 999,\n'notes': '',\n'opt': 'adam',\n'opt_betas': [0.5, 0.999],\n'ori_joints': 'beat_smplx_joints',\n'out_path': './outputs/audio2pose/',\n'pos_encoding_type': 'sin',\n'pos_prob': 1.0,\n'pose_dims': 330,\n'pose_fps': 30,\n'pose_length': 64,\n'pose_norm': False,\n'pose_rep': 'smplxflame_30',\n'pre_frames': 4,\n'pre_type': 'zero',\n'pretrain': False,\n'project': 's2g',\n'queue_size': 1024,\n'random_seed': 2021,\n'rec_aud_weight': 0.0,\n'rec_fac_weight': 0.0,\n'rec_pos_weight': 0.0,\n'rec_txt_weight': 0.0,\n'rec_ver_weight': 0.0,\n'rec_weight': 1.0,\n'render_concurrent_num': 1,\n'render_tmp_img_filetype': 'bmp',\n'render_video_fps': 30,\n'render_video_height': 720,\n'render_video_width': 1920,\n'root_path': './',\n'rot6d': True,\n'sem_rep': None,\n'sparse': 1,\n'speaker_dims': 4,\n'speaker_f': 0,\n'speaker_id': 'onehot',\n'stat': 'ts',\n'std_pose_path': '/datasets/trinity/train/',\n'stride': 20,\n't_encoder': None,\n't_fix_pre': False,\n't_pre_encoder': None,\n'tar_joints': 
'beat_smplx_full',\n'test_ckpt': './EMAGE/emage_audio_175.bin',\n'test_data_path': '/datasets/trinity/test/',\n'test_length': 64,\n'test_period': 20,\n'train_data_path': '/datasets/trinity/train/',\n'train_trans': True,\n'trainer': 'emage',\n'training_speakers': [2],\n'tsmr': 0.0,\n'ttmr': 0.0,\n'txt_prob': 1.0,\n'use_aug': False,\n'vae_codebook_size': 256,\n'vae_grow': [1, 1, 2, 1],\n'vae_layer': 4,\n'vae_length': 240,\n'vae_quantizer_lambda': 1.0,\n'vae_test_dim': 330,\n'vae_test_len': 32,\n'vae_test_stride': 20,\n'val_data_path': '/datasets/trinity/val/',\n'variational': False,\n'vel_weight': 0.0,\n'warmup_epochs': 0,\n'warmup_lr': 0.0005,\n'wei_weight': 0.0,\n'weight_decay': 0.0,\n'word_cache': False,\n'word_dims': 300,\n'word_f': 0,\n'word_index_num': 5793,\n'word_rep': None,\n'z_type': 'speaker'}\n2024-04-09 08:00:42.977 | INFO | utils.other_tools_hf:print_exp_info:878 - # ------------ 0409_080042_emage_test_hf ----------- #\n2024-04-09 08:00:42.977 | INFO | utils.other_tools_hf:print_exp_info:879 - PyTorch version: 2.2.0+cu121\n2024-04-09 08:00:42.977 | INFO | utils.other_tools_hf:print_exp_info:880 - CUDA version: 12.1\n2024-04-09 08:00:42.994 | INFO | utils.other_tools_hf:print_exp_info:881 - 1 GPUs\n2024-04-09 08:00:42.994 | INFO | utils.other_tools_hf:print_exp_info:882 - Random Seed: 2021\n/tmp/tmp_2wsoms8tmp.wav\n2024-04-09 08:00:49.943 | INFO | dataloaders.beat_testonly_hf:build_cache:90 - Audio bit rate: 16000\n2024-04-09 08:00:49.943 | INFO | dataloaders.beat_testonly_hf:build_cache:91 - Reading data './EMAGE/test_sequences/'...\n2024-04-09 08:00:49.943 | INFO | dataloaders.beat_testonly_hf:build_cache:92 - Creating the dataset cache...\n2024-04-09 08:00:49.945 | INFO | dataloaders.beat_testonly_hf:cache_generation:140 - # ---- Building cache for Pose dummy 2nd ---- #\n2024-04-09 08:00:52.632 | INFO | dataloaders.beat_testonly_hf:cache_generation:214 - # ---- Building cache for Facial dummy 2nd and Pose dummy 2nd ---- #\n2024-04-09 08:00:52.634 | 
INFO | dataloaders.beat_testonly_hf:cache_generation:260 - # ---- Building cache for Audio dummy 2nd and Pose dummy 2nd ---- #\n22050\n(108446,)\n(78691,)\n1906\n2024-04-09 08:00:52.636 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:517 - audio: 4s, pose: 63s, facial: 63s\n2024-04-09 08:00:52.636 | WARNING | dataloaders.beat_testonly_hf:_sample_from_clip:521 - reduce to 4s, ignore 59s\n2024-04-09 08:00:52.636 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:544 - pose from frame 0 to 120, length 120\n2024-04-09 08:00:52.636 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:545 - 1 clips is expected with stride 120\n2024-04-09 08:00:52.636 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:549 - audio from frame 0 to 64000, length 64000\n2024-04-09 08:00:52.638 | INFO | dataloaders.beat_testonly_hf:cache_generation:478 - no. of samples: 1\n2024-04-09 08:00:52.639 | INFO | dataloaders.beat_testonly_hf:cache_generation:483 - no. of excluded samples: 0 (0.0%)\n2024-04-09 08:00:52.639 | INFO | predict:__init__:610 - Init test dataloader success\n2024-04-09 08:00:53.217 | INFO | predict:__init__:623 - DataParallel(\n(module): MAGE_Transformer(\n(audio_pre_encoder_face): WavEncoder(\n(feat_extractor): Sequential(\n(0): BasicBlock(\n(conv1): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,))\n(bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,))\n(1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n(1): BasicBlock(\n(conv1): Conv1d(64, 64, kernel_size=(15,), stride=(6,))\n(bn1): BatchNorm1d(64, eps=1e-05, 
momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(64, 64, kernel_size=(15,), stride=(6,))\n(1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n(2): BasicBlock(\n(conv1): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n)\n(3): BasicBlock(\n(conv1): Conv1d(64, 128, kernel_size=(15,), stride=(6,))\n(bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(64, 128, kernel_size=(15,), stride=(6,))\n(1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n(4): BasicBlock(\n(conv1): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, 
inplace=True)\n)\n(5): BasicBlock(\n(conv1): Conv1d(128, 256, kernel_size=(15,), stride=(3,))\n(bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(256, 256, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(128, 256, kernel_size=(15,), stride=(3,))\n(1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n)\n)\n(audio_pre_encoder_body): WavEncoder(\n(feat_extractor): Sequential(\n(0): BasicBlock(\n(conv1): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,))\n(bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,))\n(1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n(1): BasicBlock(\n(conv1): Conv1d(64, 64, kernel_size=(15,), stride=(6,))\n(bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(64, 64, kernel_size=(15,), stride=(6,))\n(1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n(2): BasicBlock(\n(conv1): Conv1d(64, 64, 
kernel_size=(15,), stride=(1,), padding=(7,))\n(bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n)\n(3): BasicBlock(\n(conv1): Conv1d(64, 128, kernel_size=(15,), stride=(6,))\n(bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(64, 128, kernel_size=(15,), stride=(6,))\n(1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n(4): BasicBlock(\n(conv1): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n)\n(5): BasicBlock(\n(conv1): Conv1d(128, 256, kernel_size=(15,), stride=(3,))\n(bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(256, 256, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(128, 256, kernel_size=(15,), stride=(3,))\n(1): 
BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n)\n)\n(motion_encoder): VQEncoderV6(\n(main): Sequential(\n(0): Conv1d(337, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): ResBlock(\n(model): Sequential(\n(0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n)\n)\n(3): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n(4): LeakyReLU(negative_slope=0.2, inplace=True)\n(5): ResBlock(\n(model): Sequential(\n(0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n)\n)\n(6): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n(7): LeakyReLU(negative_slope=0.2, inplace=True)\n(8): ResBlock(\n(model): Sequential(\n(0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n)\n)\n)\n)\n(feature2face): Linear(in_features=512, out_features=768, bias=True)\n(face2latent): Linear(in_features=768, out_features=256, bias=True)\n(transformer_de_layer): TransformerDecoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(multihead_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, bias=True)\n(dropout): Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm3): LayerNorm((768,), eps=1e-05, 
elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n(dropout3): Dropout(p=0.1, inplace=False)\n)\n(face_decoder): TransformerDecoder(\n(layers): ModuleList(\n(0-3): 4 x TransformerDecoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(multihead_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, bias=True)\n(dropout): Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n(dropout3): Dropout(p=0.1, inplace=False)\n)\n)\n)\n(position_embeddings): PeriodicPositionalEncoding(\n(dropout): Dropout(p=0.1, inplace=False)\n)\n(transformer_en_layer): TransformerEncoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, bias=True)\n(dropout): Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n)\n(motion_self_encoder): TransformerEncoder(\n(layers): ModuleList(\n(0): TransformerEncoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, bias=True)\n(dropout): 
Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n)\n)\n)\n(audio_feature2motion): Linear(in_features=256, out_features=768, bias=True)\n(feature2motion): Linear(in_features=256, out_features=768, bias=True)\n(bodyhints_face): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=256, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=256, bias=True)\n)\n)\n(bodyhints_body): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=256, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=256, bias=True)\n)\n)\n(motion2latent_upper): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=768, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=768, bias=True)\n)\n)\n(motion2latent_hands): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=768, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=768, bias=True)\n)\n)\n(motion2latent_lower): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=768, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=768, bias=True)\n)\n)\n(wordhints_decoder): TransformerDecoder(\n(layers): ModuleList(\n(0-7): 8 x TransformerDecoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(multihead_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, 
bias=True)\n(dropout): Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n(dropout3): Dropout(p=0.1, inplace=False)\n)\n)\n)\n(upper_decoder): TransformerDecoder(\n(layers): ModuleList(\n(0): TransformerDecoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(multihead_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, bias=True)\n(dropout): Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n(dropout3): Dropout(p=0.1, inplace=False)\n)\n)\n)\n(hands_decoder): TransformerDecoder(\n(layers): ModuleList(\n(0): TransformerDecoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(multihead_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, bias=True)\n(dropout): Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm3): LayerNorm((768,), eps=1e-05, 
elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n(dropout3): Dropout(p=0.1, inplace=False)\n)\n)\n)\n(lower_decoder): TransformerDecoder(\n(layers): ModuleList(\n(0): TransformerDecoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(multihead_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, bias=True)\n(dropout): Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n(dropout3): Dropout(p=0.1, inplace=False)\n)\n)\n)\n(face_classifier): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=256, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=256, bias=True)\n)\n)\n(upper_classifier): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=256, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=256, bias=True)\n)\n)\n(hands_classifier): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=256, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=256, bias=True)\n)\n)\n(lower_classifier): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=256, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=256, bias=True)\n)\n)\n(motion_down_upper): Linear(in_features=768, out_features=256, bias=True)\n(motion_down_hands): 
Linear(in_features=768, out_features=256, bias=True)\n(motion_down_lower): Linear(in_features=768, out_features=256, bias=True)\n(spearker_encoder_body): Embedding(25, 768)\n(spearker_encoder_face): Embedding(25, 768)\n)\n)\n2024-04-09 08:00:53.219 | INFO | predict:__init__:624 - init MAGE_Transformer success\n2024-04-09 08:00:53.510 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv\n2024-04-09 08:00:53.517 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv\n2024-04-09 08:00:53.524 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv\n2024-04-09 08:00:53.536 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv\n2024-04-09 08:00:53.546 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv\n2024-04-09 08:00:53.880 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for MAGE_Transformer\ngenerate_silent_videos concurrentNum=1 time=1712649654.8881638\nsubprocess_index=0 begin_ts=1712649655.6305969\nprocessed 0 frames\nsubprocess_index=0 render=3.80 all=4.04 begin_ts=1712649655.63 render_end_ts=1712649659.43 write_end_ts=1712649659.67\nffmpeg version 5.1.4-0+deb12u1 Copyright (c) 2000-2023 the FFmpeg developers\nbuilt with gcc 12 (Debian 12.2.0-14)\nconfiguration: --prefix=/usr --extra-version=0+deb12u1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa 
--enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librist --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --disable-sndio --enable-libjxl --enable-pocketsphinx --enable-librsvg --enable-libmfx --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-libplacebo --enable-librav1e --enable-shared\nlibavutil 57. 28.100 / 57. 28.100\nlibavcodec 59. 37.100 / 59. 37.100\nlibavformat 59. 27.100 / 59. 27.100\nlibavdevice 59. 7.100 / 59. 7.100\nlibavfilter 8. 44.100 / 8. 44.100\nlibswscale 6. 7.100 / 6. 7.100\nlibswresample 4. 7.100 / 4. 7.100\nlibpostproc 56. 6.100 / 56. 6.100\nInput #0, image2, from './outputs/audio2pose/custom/hf//999/frame_%d.bmp':\nDuration: 00:00:02.00, start: 0.000000, bitrate: N/A\nStream #0:0: Video: bmp, bgr24, 1920x720, 30 fps, 30 tbr, 30 tbn\nStream mapping:\nStream #0:0 -> #0:0 (bmp (native) -> h264 (libx264))\nPress [q] to stop, [?] 
for help\n[libx264 @ 0x55679f7835c0] using cpu capabilities: MMX2 SSE2Fast SSSE3 SSE4.2 AVX FMA3 BMI2 AVX2\n[libx264 @ 0x55679f7835c0] profile High, level 4.0, 4:2:0, 8-bit\n[libx264 @ 0x55679f7835c0] 264 - core 164 r3095 baee400 - H.264/MPEG-4 AVC codec - Copyleft 2003-2022 - http://www.videolan.org/x264.html - options: cabac=1 ref=3 deblock=1:0:0 analyse=0x3:0x113 me=hex subme=7 psy=1 psy_rd=1.00:0.00 mixed_ref=1 me_range=16 chroma_me=1 trellis=1 8x8dct=1 cqm=0 deadzone=21,11 fast_pskip=1 chroma_qp_offset=-2 threads=6 lookahead_threads=1 sliced_threads=0 nr=0 decimate=1 interlaced=0 bluray_compat=0 constrained_intra=0 bframes=3 b_pyramid=2 b_adapt=1 b_bias=0 direct=1 weightb=1 open_gop=0 weightp=2 keyint=250 keyint_min=25 scenecut=40 intra_refresh=0 rc_lookahead=40 rc=crf mbtree=1 crf=23.0 qcomp=0.60 qpmin=0 qpmax=69 qpstep=4 ip_ratio=1.40 aq=1:1.00\nOutput #0, mp4, to './outputs/audio2pose/custom/hf//999/silence_video.mp4':\nMetadata:\nencoder : Lavf59.27.100\nStream #0:0: Video: h264 (avc1 / 0x31637661), yuv420p(tv, progressive), 1920x720, q=2-31, 30 fps, 15360 tbn\nMetadata:\nencoder : Lavc59.37.100 libx264\nSide data:\ncpb: bitrate max/min/avg: 0/0/0 buffer size: 0 vbv_delay: N/A\nframe= 1 fps=0.0 q=0.0 size= 0kB time=00:00:00.00 bitrate=N/A speed=N/A\nframe= 60 fps=0.0 q=-1.0 Lsize= 181kB time=00:00:01.90 bitrate= 779.0kbits/s speed=3.28x\nvideo:179kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.856900%\n[libx264 @ 0x55679f7835c0] frame I:1 Avg QP:23.69 size: 19244\n[libx264 @ 0x55679f7835c0] frame P:15 Avg QP:25.49 size: 6042\n[libx264 @ 0x55679f7835c0] frame B:44 Avg QP:30.22 size: 1656\n[libx264 @ 0x55679f7835c0] consecutive B-frames: 1.7% 0.0% 5.0% 93.3%\n[libx264 @ 0x55679f7835c0] mb I I16..4: 8.0% 83.0% 8.9%\n[libx264 @ 0x55679f7835c0] mb P I16..4: 0.4% 1.2% 0.5% P16..4: 10.2% 3.6% 1.6% 0.0% 0.0% skip:82.5%\n[libx264 @ 0x55679f7835c0] mb B I16..4: 0.0% 0.1% 0.0% B16..8: 6.4% 1.6% 0.3% direct: 0.3% skip:91.3% L0:43.6% 
L1:45.9% BI:10.5%\n[libx264 @ 0x55679f7835c0] 8x8 transform intra:76.9% inter:44.4%\n[libx264 @ 0x55679f7835c0] coded y,uvDC,uvAC intra: 19.7% 0.0% 0.0% inter: 1.5% 0.0% 0.0%\n[libx264 @ 0x55679f7835c0] i16 v,h,dc,p: 21% 19% 6% 54%\n[libx264 @ 0x55679f7835c0] i8 v,h,dc,ddl,ddr,vr,hd,vl,hu: 61% 5% 24% 1% 1% 2% 1% 2% 1%\n[libx264 @ 0x55679f7835c0] i4 v,h,dc,ddl,ddr,vr,hd,vl,hu: 41% 14% 14% 5% 5% 7% 5% 7% 3%\n[libx264 @ 0x55679f7835c0] i8c dc,h,v,p: 100% 0% 0% 0%\n[libx264 @ 0x55679f7835c0] Weighted P-Frames: Y:0.0% UV:0.0%\n[libx264 @ 0x55679f7835c0] ref P L0: 60.6% 8.9% 21.7% 8.9%\n[libx264 @ 0x55679f7835c0] ref B L0: 84.4% 12.2% 3.4%\n[libx264 @ 0x55679f7835c0] ref B L1: 95.5% 4.5%\n[libx264 @ 0x55679f7835c0] kb/s:731.05\nVideo conversion successful. Output file: ./outputs/audio2pose/custom/hf//999/silence_video.mp4\nffmpeg version 5.1.4-0+deb12u1 Copyright (c) 2000-2023 the FFmpeg developers\nbuilt with gcc 12 (Debian 12.2.0-14)\nconfiguration: --prefix=/usr --extra-version=0+deb12u1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librist --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx 
--enable-openal --enable-opencl --enable-opengl --enable-sdl2 --disable-sndio --enable-libjxl --enable-pocketsphinx --enable-librsvg --enable-libmfx --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-libplacebo --enable-librav1e --enable-shared\nlibavutil 57. 28.100 / 57. 28.100\nlibavcodec 59. 37.100 / 59. 37.100\nlibavformat 59. 27.100 / 59. 27.100\nlibavdevice 59. 7.100 / 59. 7.100\nlibavfilter 8. 44.100 / 8. 44.100\nlibswscale 6. 7.100 / 6. 7.100\nlibswresample 4. 7.100 / 4. 7.100\nlibpostproc 56. 6.100 / 56. 6.100\nInput #0, mov,mp4,m4a,3gp,3g2,mj2, from './outputs/audio2pose/custom/hf//999/silence_video.mp4':\nMetadata:\nmajor_brand : isom\nminor_version : 512\ncompatible_brands: isomiso2avc1mp41\nencoder : Lavf59.27.100\nDuration: 00:00:02.00, start: 0.000000, bitrate: 740 kb/s\nStream #0:0[0x1](und): Video: h264 (High) (avc1 / 0x31637661), yuv420p(progressive), 1920x720, 733 kb/s, 30 fps, 30 tbr, 15360 tbn (default)\nMetadata:\nhandler_name : VideoHandler\nvendor_id : [0][0][0][0]\nencoder : Lavc59.37.100 libx264\nGuessed Channel Layout for Input Stream #1.0 : mono\nInput #1, wav, from '/tmp/tmp_2wsoms8tmp.wav':\nDuration: 00:00:04.92, bitrate: 384 kb/s\nStream #1:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 24000 Hz, mono, s16, 384 kb/s\nStream mapping:\nStream #0:0 -> #0:0 (copy)\nStream #1:0 -> #0:1 (pcm_s16le (native) -> aac (native))\nPress [q] to stop, [?] 
for help\nOutput #0, mp4, to './outputs/audio2pose/custom/hf//999/res_2_scott_0_3_3.mp4':\nMetadata:\nmajor_brand : isom\nminor_version : 512\ncompatible_brands: isomiso2avc1mp41\nencoder : Lavf59.27.100\nStream #0:0(und): Video: h264 (High) (avc1 / 0x31637661), yuv420p(progressive), 1920x720, q=2-31, 733 kb/s, 30 fps, 30 tbr, 15360 tbn (default)\nMetadata:\nhandler_name : VideoHandler\nvendor_id : [0][0][0][0]\nencoder : Lavc59.37.100 libx264\nStream #0:1: Audio: aac (LC) (mp4a / 0x6134706D), 24000 Hz, mono, fltp, 69 kb/s\nMetadata:\nencoder : Lavc59.37.100 aac\nframe= 0 fps=0.0 q=-1.0 size= 0kB time=00:00:00.00 bitrate=N/A speed= 0x\nframe= 60 fps=0.0 q=-1.0 Lsize= 200kB time=00:00:02.04 bitrate= 798.9kbits/s speed= 128x\nvideo:179kB audio:18kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 1.498796%\n[aac @ 0x55a5bccc5480] Qavg: 651.317\nVideo with audio generated successfully: ./outputs/audio2pose/custom/hf//999/res_2_scott_0_3_3.mp4", "metrics": { "predict_time": 20.17069, "total_time": 163.578486 }, "output": [ "https://replicate.delivery/pbxt/49ufYatYrcS2ISwQsUszerAwfpdBVXZPtswPPeeeBpAOvnKqE/res_2_scott_0_3_3.mp4", "https://replicate.delivery/pbxt/Ib7wTfo0lbVUVq6PLxPH7Zu5P54uNklAdDpc2kbD8AzeeURlA/res_2_scott_0_3_3.npz", "https://replicate.delivery/pbxt/Dk3qZvee4hrMxEJbcW6fx3HfbZk3PkRGnSSSvyKr8Dq27piKB/gt_2_scott_0_3_3.npz" ], "started_at": "2024-04-09T08:00:42.059796Z", "status": "succeeded", "urls": { "get": "https://api.replicate.com/v1/predictions/bbtdxcbg3hrgg0cer7es72m7e4", "cancel": "https://api.replicate.com/v1/predictions/bbtdxcbg3hrgg0cer7es72m7e4/cancel" }, "version": "80048634fe2d4edf63687229c69d7b868204a569e91df880ba418eb8a7485e73" }
Generated in2024-04-09 08:00:42.977 | INFO | utils.other_tools_hf:print_exp_info:877 - {'a_encoder': None, 'a_fix_pre': False, 'a_pre_encoder': None, 'acc_weight': 0.0, 'additional_data': False, 'adv_weight': 20.0, 'ali_weight': 0.0, 'amsgrad': False, 'apex': False, 'asmr': 0.0, 'atcont': 0.0, 'atmr': 0.0, 'aud_prob': 1.0, 'audio_dims': 1, 'audio_f': 256, 'audio_fps': 16000, 'audio_norm': False, 'audio_rep': 'wave16k', 'audio_sr': 16000, 'batch_size': 64, 'beat_align': True, 'benchmark': True, 'cache_only': False, 'cache_path': './datasets/beat_cache/beat_smplx_en_emage_test/', 'cf': 0.0, 'ch': 1.0, 'cl': 1.0, 'clean_final_seconds': 0, 'clean_first_seconds': 0, 'config': './configs/emage_test_hf.yaml', 'csv_name': 'a2g_0', 'cu': 1.0, 'cudnn_enabled': True, 'd_lr_weight': 0.2, 'd_name': None, 'data_path': './EMAGE/test_sequences/', 'data_path_1': './EMAGE/', 'dataset': 'beat_testonly_hf', 'ddp': False, 'debug': False, 'decay_epochs': 9999, 'decay_rate': 0.1, 'decode_fusion': None, 'deterministic': True, 'disable_filtering': False, 'div_reg_weight': 0.0, 'dropout_prob': 0.3, 'e_name': 'VAESKConv', 'e_path': 'weights/AESKConv_240_100.bin', 'emo_rep': None, 'emotion_dims': 8, 'emotion_f': 0, 'epoch_stage': 0, 'epochs': 400, 'eval_model': 'motion_representation', 'f_encoder': 'null', 'f_fix_pre': False, 'f_pre_encoder': 'null', 'fac_prob': 1.0, 'facial_dims': 100, 'facial_f': 0, 'facial_fps': 15, 'facial_norm': False, 'facial_rep': 'smplxflame_30', 'fid_weight': 0.0, 'finger_net': 'original', 'freeze_wordembed': True, 'fsmr': 0.0, 'ftmr': 0.0, 'fusion_mode': 'sum', 'g_name': 'MAGE_Transformer', 'gap_weight': 0.0, 'gpus': [0], 'grad_norm': 0.99, 'hidden_size': 768, 'id_rep': 'onehot', 'input_context': 'both', 'is_train': True, 'ita_weight': 0.0, 'iwa_weight': 0.0, 'kld_aud_weight': 0.0, 'kld_fac_weight': 0.0, 'kld_weight': 0.0, 'l': 4, 'lf': 3.0, 'lh': 3.0, 'll': 3.0, 'loader_workers': 0, 'log_period': 10, 'loss_contrastive_neg_weight': 0.005, 
'loss_contrastive_pos_weight': 0.2, 'loss_gan_weight': 5.0, 'loss_kld_weight': 0.1, 'loss_physical_weight': 0.0, 'loss_reg_weight': 0.05, 'loss_regression_weight': 70.0, 'lr_base': 0.0005, 'lr_min': 1e-07, 'lr_policy': 'step', 'lu': 3.0, 'm_decoder': None, 'm_encoder': 'null', 'm_fix_pre': False, 'm_pre_encoder': 'null', 'mean_pose_path': '/datasets/trinity/train/', 'model': 'emage_audio', 'momentum': 0.8, 'motion_f': 256, 'msmr': 0.0, 'mtmr': 0.0, 'multi_length_training': [1.0], 'n_layer': 1, 'n_poses': 34, 'n_pre_poses': 4, 'name': '0409_080042_emage_test_hf', 'nesterov': True, 'new_cache': True, 'no_adv_epoch': 999, 'notes': '', 'opt': 'adam', 'opt_betas': [0.5, 0.999], 'ori_joints': 'beat_smplx_joints', 'out_path': './outputs/audio2pose/', 'pos_encoding_type': 'sin', 'pos_prob': 1.0, 'pose_dims': 330, 'pose_fps': 30, 'pose_length': 64, 'pose_norm': False, 'pose_rep': 'smplxflame_30', 'pre_frames': 4, 'pre_type': 'zero', 'pretrain': False, 'project': 's2g', 'queue_size': 1024, 'random_seed': 2021, 'rec_aud_weight': 0.0, 'rec_fac_weight': 0.0, 'rec_pos_weight': 0.0, 'rec_txt_weight': 0.0, 'rec_ver_weight': 0.0, 'rec_weight': 1.0, 'render_concurrent_num': 1, 'render_tmp_img_filetype': 'bmp', 'render_video_fps': 30, 'render_video_height': 720, 'render_video_width': 1920, 'root_path': './', 'rot6d': True, 'sem_rep': None, 'sparse': 1, 'speaker_dims': 4, 'speaker_f': 0, 'speaker_id': 'onehot', 'stat': 'ts', 'std_pose_path': '/datasets/trinity/train/', 'stride': 20, 't_encoder': None, 't_fix_pre': False, 't_pre_encoder': None, 'tar_joints': 'beat_smplx_full', 'test_ckpt': './EMAGE/emage_audio_175.bin', 'test_data_path': '/datasets/trinity/test/', 'test_length': 64, 'test_period': 20, 'train_data_path': '/datasets/trinity/train/', 'train_trans': True, 'trainer': 'emage', 'training_speakers': [2], 'tsmr': 0.0, 'ttmr': 0.0, 'txt_prob': 1.0, 'use_aug': False, 'vae_codebook_size': 256, 'vae_grow': [1, 1, 2, 1], 'vae_layer': 4, 'vae_length': 240, 'vae_quantizer_lambda': 
1.0, 'vae_test_dim': 330, 'vae_test_len': 32, 'vae_test_stride': 20, 'val_data_path': '/datasets/trinity/val/', 'variational': False, 'vel_weight': 0.0, 'warmup_epochs': 0, 'warmup_lr': 0.0005, 'wei_weight': 0.0, 'weight_decay': 0.0, 'word_cache': False, 'word_dims': 300, 'word_f': 0, 'word_index_num': 5793, 'word_rep': None, 'z_type': 'speaker'} 2024-04-09 08:00:42.977 | INFO | utils.other_tools_hf:print_exp_info:878 - # ------------ 0409_080042_emage_test_hf ----------- # 2024-04-09 08:00:42.977 | INFO | utils.other_tools_hf:print_exp_info:879 - PyTorch version: 2.2.0+cu121 2024-04-09 08:00:42.977 | INFO | utils.other_tools_hf:print_exp_info:880 - CUDA version: 12.1 2024-04-09 08:00:42.994 | INFO | utils.other_tools_hf:print_exp_info:881 - 1 GPUs 2024-04-09 08:00:42.994 | INFO | utils.other_tools_hf:print_exp_info:882 - Random Seed: 2021 /tmp/tmp_2wsoms8tmp.wav 2024-04-09 08:00:49.943 | INFO | dataloaders.beat_testonly_hf:build_cache:90 - Audio bit rate: 16000 2024-04-09 08:00:49.943 | INFO | dataloaders.beat_testonly_hf:build_cache:91 - Reading data './EMAGE/test_sequences/'... 2024-04-09 08:00:49.943 | INFO | dataloaders.beat_testonly_hf:build_cache:92 - Creating the dataset cache... 
2024-04-09 08:00:49.945 | INFO | dataloaders.beat_testonly_hf:cache_generation:140 - # ---- Building cache for Pose dummy 2nd ---- # 2024-04-09 08:00:52.632 | INFO | dataloaders.beat_testonly_hf:cache_generation:214 - # ---- Building cache for Facial dummy 2nd and Pose dummy 2nd ---- # 2024-04-09 08:00:52.634 | INFO | dataloaders.beat_testonly_hf:cache_generation:260 - # ---- Building cache for Audio dummy 2nd and Pose dummy 2nd ---- # 22050 (108446,) (78691,) 1906 2024-04-09 08:00:52.636 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:517 - audio: 4s, pose: 63s, facial: 63s 2024-04-09 08:00:52.636 | WARNING | dataloaders.beat_testonly_hf:_sample_from_clip:521 - reduce to 4s, ignore 59s 2024-04-09 08:00:52.636 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:544 - pose from frame 0 to 120, length 120 2024-04-09 08:00:52.636 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:545 - 1 clips is expected with stride 120 2024-04-09 08:00:52.636 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:549 - audio from frame 0 to 64000, length 64000 2024-04-09 08:00:52.638 | INFO | dataloaders.beat_testonly_hf:cache_generation:478 - no. of samples: 1 2024-04-09 08:00:52.639 | INFO | dataloaders.beat_testonly_hf:cache_generation:483 - no. 
of excluded samples: 0 (0.0%) 2024-04-09 08:00:52.639 | INFO | predict:__init__:610 - Init test dataloader success 2024-04-09 08:00:53.217 | INFO | predict:__init__:623 - DataParallel( (module): MAGE_Transformer( (audio_pre_encoder_face): WavEncoder( (feat_extractor): Sequential( (0): BasicBlock( (conv1): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,)) (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,)) (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): BasicBlock( (conv1): Conv1d(64, 64, kernel_size=(15,), stride=(6,)) (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(64, 64, kernel_size=(15,), stride=(6,)) (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (2): BasicBlock( (conv1): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) ) (3): BasicBlock( (conv1): Conv1d(64, 128, kernel_size=(15,), 
stride=(6,)) (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(64, 128, kernel_size=(15,), stride=(6,)) (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (4): BasicBlock( (conv1): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,)) (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) ) (5): BasicBlock( (conv1): Conv1d(128, 256, kernel_size=(15,), stride=(3,)) (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(256, 256, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(128, 256, kernel_size=(15,), stride=(3,)) (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (audio_pre_encoder_body): WavEncoder( (feat_extractor): Sequential( (0): BasicBlock( (conv1): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,)) (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, 
affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,)) (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): BasicBlock( (conv1): Conv1d(64, 64, kernel_size=(15,), stride=(6,)) (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(64, 64, kernel_size=(15,), stride=(6,)) (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (2): BasicBlock( (conv1): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) ) (3): BasicBlock( (conv1): Conv1d(64, 128, kernel_size=(15,), stride=(6,)) (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(64, 128, kernel_size=(15,), stride=(6,)) (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (4): BasicBlock( (conv1): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,)) (bn1): 
BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) ) (5): BasicBlock( (conv1): Conv1d(128, 256, kernel_size=(15,), stride=(3,)) (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(256, 256, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(128, 256, kernel_size=(15,), stride=(3,)) (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (motion_encoder): VQEncoderV6( (main): Sequential( (0): Conv1d(337, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): ResBlock( (model): Sequential( (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) ) ) (3): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (4): LeakyReLU(negative_slope=0.2, inplace=True) (5): ResBlock( (model): Sequential( (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) ) ) (6): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (7): LeakyReLU(negative_slope=0.2, inplace=True) (8): ResBlock( (model): Sequential( (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Conv1d(256, 256, kernel_size=(3,), 
stride=(1,), padding=(1,)) ) ) ) ) (feature2face): Linear(in_features=512, out_features=768, bias=True) (face2latent): Linear(in_features=768, out_features=256, bias=True) (transformer_de_layer): TransformerDecoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (multihead_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) (dropout3): Dropout(p=0.1, inplace=False) ) (face_decoder): TransformerDecoder( (layers): ModuleList( (0-3): 4 x TransformerDecoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (multihead_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) (dropout3): Dropout(p=0.1, inplace=False) ) ) ) (position_embeddings): PeriodicPositionalEncoding( (dropout): Dropout(p=0.1, inplace=False) ) (transformer_en_layer): TransformerEncoderLayer( (self_attn): MultiheadAttention( (out_proj): 
NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) ) (motion_self_encoder): TransformerEncoder( (layers): ModuleList( (0): TransformerEncoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) ) ) ) (audio_feature2motion): Linear(in_features=256, out_features=768, bias=True) (feature2motion): Linear(in_features=256, out_features=768, bias=True) (bodyhints_face): MLP( (mlp): Sequential( (0): Linear(in_features=256, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=256, bias=True) ) ) (bodyhints_body): MLP( (mlp): Sequential( (0): Linear(in_features=256, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=256, bias=True) ) ) (motion2latent_upper): MLP( (mlp): Sequential( (0): Linear(in_features=768, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=768, bias=True) ) ) (motion2latent_hands): MLP( (mlp): Sequential( (0): Linear(in_features=768, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, 
inplace=True) (2): Linear(in_features=768, out_features=768, bias=True) ) ) (motion2latent_lower): MLP( (mlp): Sequential( (0): Linear(in_features=768, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=768, bias=True) ) ) (wordhints_decoder): TransformerDecoder( (layers): ModuleList( (0-7): 8 x TransformerDecoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (multihead_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) (dropout3): Dropout(p=0.1, inplace=False) ) ) ) (upper_decoder): TransformerDecoder( (layers): ModuleList( (0): TransformerDecoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (multihead_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) (dropout3): Dropout(p=0.1, inplace=False) ) ) ) (hands_decoder): 
TransformerDecoder( (layers): ModuleList( (0): TransformerDecoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (multihead_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) (dropout3): Dropout(p=0.1, inplace=False) ) ) ) (lower_decoder): TransformerDecoder( (layers): ModuleList( (0): TransformerDecoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (multihead_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) (dropout3): Dropout(p=0.1, inplace=False) ) ) ) (face_classifier): MLP( (mlp): Sequential( (0): Linear(in_features=256, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=256, bias=True) ) ) (upper_classifier): MLP( (mlp): Sequential( (0): Linear(in_features=256, out_features=768, bias=True) (1): 
LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=256, bias=True) ) ) (hands_classifier): MLP( (mlp): Sequential( (0): Linear(in_features=256, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=256, bias=True) ) ) (lower_classifier): MLP( (mlp): Sequential( (0): Linear(in_features=256, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=256, bias=True) ) ) (motion_down_upper): Linear(in_features=768, out_features=256, bias=True) (motion_down_hands): Linear(in_features=768, out_features=256, bias=True) (motion_down_lower): Linear(in_features=768, out_features=256, bias=True) (spearker_encoder_body): Embedding(25, 768) (spearker_encoder_face): Embedding(25, 768) ) ) 2024-04-09 08:00:53.219 | INFO | predict:__init__:624 - init MAGE_Transformer success 2024-04-09 08:00:53.510 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv 2024-04-09 08:00:53.517 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv 2024-04-09 08:00:53.524 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv 2024-04-09 08:00:53.536 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv 2024-04-09 08:00:53.546 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv 2024-04-09 08:00:53.880 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for MAGE_Transformer generate_silent_videos concurrentNum=1 time=1712649654.8881638 subprocess_index=0 begin_ts=1712649655.6305969 processed 0 frames subprocess_index=0 render=3.80 all=4.04 begin_ts=1712649655.63 render_end_ts=1712649659.43 write_end_ts=1712649659.67 ffmpeg version 5.1.4-0+deb12u1 Copyright (c) 2000-2023 the 
FFmpeg developers built with gcc 12 (Debian 12.2.0-14) configuration: --prefix=/usr --extra-version=0+deb12u1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librist --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --disable-sndio --enable-libjxl --enable-pocketsphinx --enable-librsvg --enable-libmfx --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-libplacebo --enable-librav1e --enable-shared libavutil 57. 28.100 / 57. 28.100 libavcodec 59. 37.100 / 59. 37.100 libavformat 59. 27.100 / 59. 27.100 libavdevice 59. 7.100 / 59. 7.100 libavfilter 8. 44.100 / 8. 44.100 libswscale 6. 7.100 / 6. 7.100 libswresample 4. 7.100 / 4. 7.100 libpostproc 56. 6.100 / 56. 6.100 Input #0, image2, from './outputs/audio2pose/custom/hf//999/frame_%d.bmp': Duration: 00:00:02.00, start: 0.000000, bitrate: N/A Stream #0:0: Video: bmp, bgr24, 1920x720, 30 fps, 30 tbr, 30 tbn Stream mapping: Stream #0:0 -> #0:0 (bmp (native) -> h264 (libx264)) Press [q] to stop, [?] 
for help [libx264 @ 0x55679f7835c0] using cpu capabilities: MMX2 SSE2Fast SSSE3 SSE4.2 AVX FMA3 BMI2 AVX2 [libx264 @ 0x55679f7835c0] profile High, level 4.0, 4:2:0, 8-bit [libx264 @ 0x55679f7835c0] 264 - core 164 r3095 baee400 - H.264/MPEG-4 AVC codec - Copyleft 2003-2022 - http://www.videolan.org/x264.html - options: cabac=1 ref=3 deblock=1:0:0 analyse=0x3:0x113 me=hex subme=7 psy=1 psy_rd=1.00:0.00 mixed_ref=1 me_range=16 chroma_me=1 trellis=1 8x8dct=1 cqm=0 deadzone=21,11 fast_pskip=1 chroma_qp_offset=-2 threads=6 lookahead_threads=1 sliced_threads=0 nr=0 decimate=1 interlaced=0 bluray_compat=0 constrained_intra=0 bframes=3 b_pyramid=2 b_adapt=1 b_bias=0 direct=1 weightb=1 open_gop=0 weightp=2 keyint=250 keyint_min=25 scenecut=40 intra_refresh=0 rc_lookahead=40 rc=crf mbtree=1 crf=23.0 qcomp=0.60 qpmin=0 qpmax=69 qpstep=4 ip_ratio=1.40 aq=1:1.00 Output #0, mp4, to './outputs/audio2pose/custom/hf//999/silence_video.mp4': Metadata: encoder : Lavf59.27.100 Stream #0:0: Video: h264 (avc1 / 0x31637661), yuv420p(tv, progressive), 1920x720, q=2-31, 30 fps, 15360 tbn Metadata: encoder : Lavc59.37.100 libx264 Side data: cpb: bitrate max/min/avg: 0/0/0 buffer size: 0 vbv_delay: N/A frame= 1 fps=0.0 q=0.0 size= 0kB time=00:00:00.00 bitrate=N/A speed=N/A frame= 60 fps=0.0 q=-1.0 Lsize= 181kB time=00:00:01.90 bitrate= 779.0kbits/s speed=3.28x video:179kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.856900% [libx264 @ 0x55679f7835c0] frame I:1 Avg QP:23.69 size: 19244 [libx264 @ 0x55679f7835c0] frame P:15 Avg QP:25.49 size: 6042 [libx264 @ 0x55679f7835c0] frame B:44 Avg QP:30.22 size: 1656 [libx264 @ 0x55679f7835c0] consecutive B-frames: 1.7% 0.0% 5.0% 93.3% [libx264 @ 0x55679f7835c0] mb I I16..4: 8.0% 83.0% 8.9% [libx264 @ 0x55679f7835c0] mb P I16..4: 0.4% 1.2% 0.5% P16..4: 10.2% 3.6% 1.6% 0.0% 0.0% skip:82.5% [libx264 @ 0x55679f7835c0] mb B I16..4: 0.0% 0.1% 0.0% B16..8: 6.4% 1.6% 0.3% direct: 0.3% skip:91.3% L0:43.6% L1:45.9% BI:10.5% 
[libx264 @ 0x55679f7835c0] 8x8 transform intra:76.9% inter:44.4% [libx264 @ 0x55679f7835c0] coded y,uvDC,uvAC intra: 19.7% 0.0% 0.0% inter: 1.5% 0.0% 0.0% [libx264 @ 0x55679f7835c0] i16 v,h,dc,p: 21% 19% 6% 54% [libx264 @ 0x55679f7835c0] i8 v,h,dc,ddl,ddr,vr,hd,vl,hu: 61% 5% 24% 1% 1% 2% 1% 2% 1% [libx264 @ 0x55679f7835c0] i4 v,h,dc,ddl,ddr,vr,hd,vl,hu: 41% 14% 14% 5% 5% 7% 5% 7% 3% [libx264 @ 0x55679f7835c0] i8c dc,h,v,p: 100% 0% 0% 0% [libx264 @ 0x55679f7835c0] Weighted P-Frames: Y:0.0% UV:0.0% [libx264 @ 0x55679f7835c0] ref P L0: 60.6% 8.9% 21.7% 8.9% [libx264 @ 0x55679f7835c0] ref B L0: 84.4% 12.2% 3.4% [libx264 @ 0x55679f7835c0] ref B L1: 95.5% 4.5% [libx264 @ 0x55679f7835c0] kb/s:731.05 Video conversion successful. Output file: ./outputs/audio2pose/custom/hf//999/silence_video.mp4 ffmpeg version 5.1.4-0+deb12u1 Copyright (c) 2000-2023 the FFmpeg developers built with gcc 12 (Debian 12.2.0-14) configuration: --prefix=/usr --extra-version=0+deb12u1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librist --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl 
--enable-opengl --enable-sdl2 --disable-sndio --enable-libjxl --enable-pocketsphinx --enable-librsvg --enable-libmfx --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-libplacebo --enable-librav1e --enable-shared libavutil 57. 28.100 / 57. 28.100 libavcodec 59. 37.100 / 59. 37.100 libavformat 59. 27.100 / 59. 27.100 libavdevice 59. 7.100 / 59. 7.100 libavfilter 8. 44.100 / 8. 44.100 libswscale 6. 7.100 / 6. 7.100 libswresample 4. 7.100 / 4. 7.100 libpostproc 56. 6.100 / 56. 6.100 Input #0, mov,mp4,m4a,3gp,3g2,mj2, from './outputs/audio2pose/custom/hf//999/silence_video.mp4': Metadata: major_brand : isom minor_version : 512 compatible_brands: isomiso2avc1mp41 encoder : Lavf59.27.100 Duration: 00:00:02.00, start: 0.000000, bitrate: 740 kb/s Stream #0:0[0x1](und): Video: h264 (High) (avc1 / 0x31637661), yuv420p(progressive), 1920x720, 733 kb/s, 30 fps, 30 tbr, 15360 tbn (default) Metadata: handler_name : VideoHandler vendor_id : [0][0][0][0] encoder : Lavc59.37.100 libx264 Guessed Channel Layout for Input Stream #1.0 : mono Input #1, wav, from '/tmp/tmp_2wsoms8tmp.wav': Duration: 00:00:04.92, bitrate: 384 kb/s Stream #1:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 24000 Hz, mono, s16, 384 kb/s Stream mapping: Stream #0:0 -> #0:0 (copy) Stream #1:0 -> #0:1 (pcm_s16le (native) -> aac (native)) Press [q] to stop, [?] 
for help Output #0, mp4, to './outputs/audio2pose/custom/hf//999/res_2_scott_0_3_3.mp4': Metadata: major_brand : isom minor_version : 512 compatible_brands: isomiso2avc1mp41 encoder : Lavf59.27.100 Stream #0:0(und): Video: h264 (High) (avc1 / 0x31637661), yuv420p(progressive), 1920x720, q=2-31, 733 kb/s, 30 fps, 30 tbr, 15360 tbn (default) Metadata: handler_name : VideoHandler vendor_id : [0][0][0][0] encoder : Lavc59.37.100 libx264 Stream #0:1: Audio: aac (LC) (mp4a / 0x6134706D), 24000 Hz, mono, fltp, 69 kb/s Metadata: encoder : Lavc59.37.100 aac frame= 0 fps=0.0 q=-1.0 size= 0kB time=00:00:00.00 bitrate=N/A speed= 0x frame= 60 fps=0.0 q=-1.0 Lsize= 200kB time=00:00:02.04 bitrate= 798.9kbits/s speed= 128x video:179kB audio:18kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 1.498796% [aac @ 0x55a5bccc5480] Qavg: 651.317 Video with audio generated successfully: ./outputs/audio2pose/custom/hf//999/res_2_scott_0_3_3.mp4
Prediction
camenduru/emage:80048634fe2d4edf63687229c69d7b868204a569e91df880ba418eb8a7485e73 | ID: rr8ja8ndw1rgg0cer7gahachn4 | Status: Succeeded | Source: Web | Hardware: A40 | Total duration | Created | Input
- audio_path
- Video Player is loading.Current Time 00:00:000/Duration 00:00:000Loaded: 0%Stream Type LIVERemaining Time -00:00:0001x
- Chapters
- descriptions off, selected
- captions settings, opens captions settings dialog
- captions off, selected
This is a modal window.
Beginning of dialog window. Escape will cancel and close the window.
End of dialog window.
{ "audio_path": "https://replicate.delivery/pbxt/KiSGX34szVb7honQI7PAmO8tzllLueRcr73z0RM2qaV6C2ZW/ash.wav" }
Install Replicate’s Node.js client library: npm install replicate
Import and set up the client: import Replicate from "replicate"; const replicate = new Replicate({ auth: process.env.REPLICATE_API_TOKEN, });
Run camenduru/emage using Replicate’s API. Check out the model's schema for an overview of inputs and outputs.
const output = await replicate.run( "camenduru/emage:80048634fe2d4edf63687229c69d7b868204a569e91df880ba418eb8a7485e73", { input: { audio_path: "https://replicate.delivery/pbxt/KiSGX34szVb7honQI7PAmO8tzllLueRcr73z0RM2qaV6C2ZW/ash.wav" } } ); // To access the file URL: console.log(output[0].url()); //=> "http://example.com" // To write the file to disk: fs.writeFile("my-image.png", output[0]);
To learn more, take a look at the guide on getting started with Node.js.
Install Replicate’s Python client library: pip install replicate
Import the client: import replicate
Run camenduru/emage using Replicate’s API. Check out the model's schema for an overview of inputs and outputs.
output = replicate.run( "camenduru/emage:80048634fe2d4edf63687229c69d7b868204a569e91df880ba418eb8a7485e73", input={ "audio_path": "https://replicate.delivery/pbxt/KiSGX34szVb7honQI7PAmO8tzllLueRcr73z0RM2qaV6C2ZW/ash.wav" } ) print(output)
To learn more, take a look at the guide on getting started with Python.
Run camenduru/emage using Replicate’s API. Check out the model's schema for an overview of inputs and outputs.
curl -s -X POST \ -H "Authorization: Bearer $REPLICATE_API_TOKEN" \ -H "Content-Type: application/json" \ -H "Prefer: wait" \ -d $'{ "version": "camenduru/emage:80048634fe2d4edf63687229c69d7b868204a569e91df880ba418eb8a7485e73", "input": { "audio_path": "https://replicate.delivery/pbxt/KiSGX34szVb7honQI7PAmO8tzllLueRcr73z0RM2qaV6C2ZW/ash.wav" } }' \ https://api.replicate.com/v1/predictions
To learn more, take a look at Replicate’s HTTP API reference docs.
Output
{ "completed_at": "2024-04-09T08:02:12.606308Z", "created_at": "2024-04-09T08:01:51.072000Z", "data_removed": false, "error": null, "id": "rr8ja8ndw1rgg0cer7gahachn4", "input": { "audio_path": "https://replicate.delivery/pbxt/KiSGX34szVb7honQI7PAmO8tzllLueRcr73z0RM2qaV6C2ZW/ash.wav" }, "logs": "2024-04-09 08:01:51.795 | INFO | utils.other_tools_hf:print_exp_info:877 - {'a_encoder': None,\n'a_fix_pre': False,\n'a_pre_encoder': None,\n'acc_weight': 0.0,\n'additional_data': False,\n'adv_weight': 20.0,\n'ali_weight': 0.0,\n'amsgrad': False,\n'apex': False,\n'asmr': 0.0,\n'atcont': 0.0,\n'atmr': 0.0,\n'aud_prob': 1.0,\n'audio_dims': 1,\n'audio_f': 256,\n'audio_fps': 16000,\n'audio_norm': False,\n'audio_rep': 'wave16k',\n'audio_sr': 16000,\n'batch_size': 64,\n'beat_align': True,\n'benchmark': True,\n'cache_only': False,\n'cache_path': './datasets/beat_cache/beat_smplx_en_emage_test/',\n'cf': 0.0,\n'ch': 1.0,\n'cl': 1.0,\n'clean_final_seconds': 0,\n'clean_first_seconds': 0,\n'config': './configs/emage_test_hf.yaml',\n'csv_name': 'a2g_0',\n'cu': 1.0,\n'cudnn_enabled': True,\n'd_lr_weight': 0.2,\n'd_name': None,\n'data_path': './EMAGE/test_sequences/',\n'data_path_1': './EMAGE/',\n'dataset': 'beat_testonly_hf',\n'ddp': False,\n'debug': False,\n'decay_epochs': 9999,\n'decay_rate': 0.1,\n'decode_fusion': None,\n'deterministic': True,\n'disable_filtering': False,\n'div_reg_weight': 0.0,\n'dropout_prob': 0.3,\n'e_name': 'VAESKConv',\n'e_path': 'weights/AESKConv_240_100.bin',\n'emo_rep': None,\n'emotion_dims': 8,\n'emotion_f': 0,\n'epoch_stage': 0,\n'epochs': 400,\n'eval_model': 'motion_representation',\n'f_encoder': 'null',\n'f_fix_pre': False,\n'f_pre_encoder': 'null',\n'fac_prob': 1.0,\n'facial_dims': 100,\n'facial_f': 0,\n'facial_fps': 15,\n'facial_norm': False,\n'facial_rep': 'smplxflame_30',\n'fid_weight': 0.0,\n'finger_net': 'original',\n'freeze_wordembed': True,\n'fsmr': 0.0,\n'ftmr': 0.0,\n'fusion_mode': 'sum',\n'g_name': 'MAGE_Transformer',\n'gap_weight': 0.0,\n'gpus': 
[0],\n'grad_norm': 0.99,\n'hidden_size': 768,\n'id_rep': 'onehot',\n'input_context': 'both',\n'is_train': True,\n'ita_weight': 0.0,\n'iwa_weight': 0.0,\n'kld_aud_weight': 0.0,\n'kld_fac_weight': 0.0,\n'kld_weight': 0.0,\n'l': 4,\n'lf': 3.0,\n'lh': 3.0,\n'll': 3.0,\n'loader_workers': 0,\n'log_period': 10,\n'loss_contrastive_neg_weight': 0.005,\n'loss_contrastive_pos_weight': 0.2,\n'loss_gan_weight': 5.0,\n'loss_kld_weight': 0.1,\n'loss_physical_weight': 0.0,\n'loss_reg_weight': 0.05,\n'loss_regression_weight': 70.0,\n'lr_base': 0.0005,\n'lr_min': 1e-07,\n'lr_policy': 'step',\n'lu': 3.0,\n'm_decoder': None,\n'm_encoder': 'null',\n'm_fix_pre': False,\n'm_pre_encoder': 'null',\n'mean_pose_path': '/datasets/trinity/train/',\n'model': 'emage_audio',\n'momentum': 0.8,\n'motion_f': 256,\n'msmr': 0.0,\n'mtmr': 0.0,\n'multi_length_training': [1.0],\n'n_layer': 1,\n'n_poses': 34,\n'n_pre_poses': 4,\n'name': '0409_080151_emage_test_hf',\n'nesterov': True,\n'new_cache': True,\n'no_adv_epoch': 999,\n'notes': '',\n'opt': 'adam',\n'opt_betas': [0.5, 0.999],\n'ori_joints': 'beat_smplx_joints',\n'out_path': './outputs/audio2pose/',\n'pos_encoding_type': 'sin',\n'pos_prob': 1.0,\n'pose_dims': 330,\n'pose_fps': 30,\n'pose_length': 64,\n'pose_norm': False,\n'pose_rep': 'smplxflame_30',\n'pre_frames': 4,\n'pre_type': 'zero',\n'pretrain': False,\n'project': 's2g',\n'queue_size': 1024,\n'random_seed': 2021,\n'rec_aud_weight': 0.0,\n'rec_fac_weight': 0.0,\n'rec_pos_weight': 0.0,\n'rec_txt_weight': 0.0,\n'rec_ver_weight': 0.0,\n'rec_weight': 1.0,\n'render_concurrent_num': 1,\n'render_tmp_img_filetype': 'bmp',\n'render_video_fps': 30,\n'render_video_height': 720,\n'render_video_width': 1920,\n'root_path': './',\n'rot6d': True,\n'sem_rep': None,\n'sparse': 1,\n'speaker_dims': 4,\n'speaker_f': 0,\n'speaker_id': 'onehot',\n'stat': 'ts',\n'std_pose_path': '/datasets/trinity/train/',\n'stride': 20,\n't_encoder': None,\n't_fix_pre': False,\n't_pre_encoder': None,\n'tar_joints': 
'beat_smplx_full',\n'test_ckpt': './EMAGE/emage_audio_175.bin',\n'test_data_path': '/datasets/trinity/test/',\n'test_length': 64,\n'test_period': 20,\n'train_data_path': '/datasets/trinity/train/',\n'train_trans': True,\n'trainer': 'emage',\n'training_speakers': [2],\n'tsmr': 0.0,\n'ttmr': 0.0,\n'txt_prob': 1.0,\n'use_aug': False,\n'vae_codebook_size': 256,\n'vae_grow': [1, 1, 2, 1],\n'vae_layer': 4,\n'vae_length': 240,\n'vae_quantizer_lambda': 1.0,\n'vae_test_dim': 330,\n'vae_test_len': 32,\n'vae_test_stride': 20,\n'val_data_path': '/datasets/trinity/val/',\n'variational': False,\n'vel_weight': 0.0,\n'warmup_epochs': 0,\n'warmup_lr': 0.0005,\n'wei_weight': 0.0,\n'weight_decay': 0.0,\n'word_cache': False,\n'word_dims': 300,\n'word_f': 0,\n'word_index_num': 5793,\n'word_rep': None,\n'z_type': 'speaker'}\n2024-04-09 08:01:51.795 | INFO | utils.other_tools_hf:print_exp_info:878 - # ------------ 0409_080151_emage_test_hf ----------- #\n2024-04-09 08:01:51.796 | INFO | utils.other_tools_hf:print_exp_info:879 - PyTorch version: 2.2.0+cu121\n2024-04-09 08:01:51.796 | INFO | utils.other_tools_hf:print_exp_info:880 - CUDA version: 12.1\n2024-04-09 08:01:51.796 | INFO | utils.other_tools_hf:print_exp_info:881 - 1 GPUs\n2024-04-09 08:01:51.796 | INFO | utils.other_tools_hf:print_exp_info:882 - Random Seed: 2021\n/tmp/tmpvn62w1e0ash.wav\n2024-04-09 08:01:51.958 | INFO | dataloaders.beat_testonly_hf:build_cache:90 - Audio bit rate: 16000\n2024-04-09 08:01:51.958 | INFO | dataloaders.beat_testonly_hf:build_cache:91 - Reading data './EMAGE/test_sequences/'...\n2024-04-09 08:01:51.958 | INFO | dataloaders.beat_testonly_hf:build_cache:92 - Creating the dataset cache...\n2024-04-09 08:01:51.960 | INFO | dataloaders.beat_testonly_hf:cache_generation:140 - # ---- Building cache for Pose dummy 2nd ---- #\n2024-04-09 08:01:54.684 | INFO | dataloaders.beat_testonly_hf:cache_generation:214 - # ---- Building cache for Facial dummy 2nd and Pose dummy 2nd ---- #\n2024-04-09 08:01:54.686 | 
INFO | dataloaders.beat_testonly_hf:cache_generation:260 - # ---- Building cache for Audio dummy 2nd and Pose dummy 2nd ---- #\n22050\n(169201,)\n(122777,)\n1906\n2024-04-09 08:01:54.689 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:517 - audio: 7s, pose: 63s, facial: 63s\n2024-04-09 08:01:54.689 | WARNING | dataloaders.beat_testonly_hf:_sample_from_clip:521 - reduce to 7s, ignore 56s\n2024-04-09 08:01:54.689 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:544 - pose from frame 0 to 210, length 210\n2024-04-09 08:01:54.689 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:545 - 1 clips is expected with stride 210\n2024-04-09 08:01:54.689 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:549 - audio from frame 0 to 112000, length 112000\n2024-04-09 08:01:54.694 | INFO | dataloaders.beat_testonly_hf:cache_generation:478 - no. of samples: 1\n2024-04-09 08:01:54.694 | INFO | dataloaders.beat_testonly_hf:cache_generation:483 - no. of excluded samples: 0 (0.0%)\n2024-04-09 08:01:54.699 | INFO | predict:__init__:610 - Init test dataloader success\n2024-04-09 08:01:55.163 | INFO | predict:__init__:623 - DataParallel(\n(module): MAGE_Transformer(\n(audio_pre_encoder_face): WavEncoder(\n(feat_extractor): Sequential(\n(0): BasicBlock(\n(conv1): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,))\n(bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,))\n(1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n(1): BasicBlock(\n(conv1): Conv1d(64, 64, kernel_size=(15,), stride=(6,))\n(bn1): BatchNorm1d(64, eps=1e-05, 
momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(64, 64, kernel_size=(15,), stride=(6,))\n(1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n(2): BasicBlock(\n(conv1): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n)\n(3): BasicBlock(\n(conv1): Conv1d(64, 128, kernel_size=(15,), stride=(6,))\n(bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(64, 128, kernel_size=(15,), stride=(6,))\n(1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n(4): BasicBlock(\n(conv1): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, 
inplace=True)\n)\n(5): BasicBlock(\n(conv1): Conv1d(128, 256, kernel_size=(15,), stride=(3,))\n(bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(256, 256, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(128, 256, kernel_size=(15,), stride=(3,))\n(1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n)\n)\n(audio_pre_encoder_body): WavEncoder(\n(feat_extractor): Sequential(\n(0): BasicBlock(\n(conv1): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,))\n(bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,))\n(1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n(1): BasicBlock(\n(conv1): Conv1d(64, 64, kernel_size=(15,), stride=(6,))\n(bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(64, 64, kernel_size=(15,), stride=(6,))\n(1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n(2): BasicBlock(\n(conv1): Conv1d(64, 64, 
kernel_size=(15,), stride=(1,), padding=(7,))\n(bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n)\n(3): BasicBlock(\n(conv1): Conv1d(64, 128, kernel_size=(15,), stride=(6,))\n(bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(64, 128, kernel_size=(15,), stride=(6,))\n(1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n(4): BasicBlock(\n(conv1): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n)\n(5): BasicBlock(\n(conv1): Conv1d(128, 256, kernel_size=(15,), stride=(3,))\n(bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(256, 256, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(128, 256, kernel_size=(15,), stride=(3,))\n(1): 
BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n)\n)\n(motion_encoder): VQEncoderV6(\n(main): Sequential(\n(0): Conv1d(337, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): ResBlock(\n(model): Sequential(\n(0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n)\n)\n(3): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n(4): LeakyReLU(negative_slope=0.2, inplace=True)\n(5): ResBlock(\n(model): Sequential(\n(0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n)\n)\n(6): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n(7): LeakyReLU(negative_slope=0.2, inplace=True)\n(8): ResBlock(\n(model): Sequential(\n(0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n)\n)\n)\n)\n(feature2face): Linear(in_features=512, out_features=768, bias=True)\n(face2latent): Linear(in_features=768, out_features=256, bias=True)\n(transformer_de_layer): TransformerDecoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(multihead_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, bias=True)\n(dropout): Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm3): LayerNorm((768,), eps=1e-05, 
elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n(dropout3): Dropout(p=0.1, inplace=False)\n)\n(face_decoder): TransformerDecoder(\n(layers): ModuleList(\n(0-3): 4 x TransformerDecoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(multihead_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, bias=True)\n(dropout): Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n(dropout3): Dropout(p=0.1, inplace=False)\n)\n)\n)\n(position_embeddings): PeriodicPositionalEncoding(\n(dropout): Dropout(p=0.1, inplace=False)\n)\n(transformer_en_layer): TransformerEncoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, bias=True)\n(dropout): Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n)\n(motion_self_encoder): TransformerEncoder(\n(layers): ModuleList(\n(0): TransformerEncoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, bias=True)\n(dropout): 
Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n)\n)\n)\n(audio_feature2motion): Linear(in_features=256, out_features=768, bias=True)\n(feature2motion): Linear(in_features=256, out_features=768, bias=True)\n(bodyhints_face): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=256, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=256, bias=True)\n)\n)\n(bodyhints_body): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=256, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=256, bias=True)\n)\n)\n(motion2latent_upper): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=768, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=768, bias=True)\n)\n)\n(motion2latent_hands): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=768, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=768, bias=True)\n)\n)\n(motion2latent_lower): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=768, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=768, bias=True)\n)\n)\n(wordhints_decoder): TransformerDecoder(\n(layers): ModuleList(\n(0-7): 8 x TransformerDecoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(multihead_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, 
bias=True)\n(dropout): Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n(dropout3): Dropout(p=0.1, inplace=False)\n)\n)\n)\n(upper_decoder): TransformerDecoder(\n(layers): ModuleList(\n(0): TransformerDecoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(multihead_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, bias=True)\n(dropout): Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n(dropout3): Dropout(p=0.1, inplace=False)\n)\n)\n)\n(hands_decoder): TransformerDecoder(\n(layers): ModuleList(\n(0): TransformerDecoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(multihead_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, bias=True)\n(dropout): Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm3): LayerNorm((768,), eps=1e-05, 
elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n(dropout3): Dropout(p=0.1, inplace=False)\n)\n)\n)\n(lower_decoder): TransformerDecoder(\n(layers): ModuleList(\n(0): TransformerDecoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(multihead_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, bias=True)\n(dropout): Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n(dropout3): Dropout(p=0.1, inplace=False)\n)\n)\n)\n(face_classifier): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=256, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=256, bias=True)\n)\n)\n(upper_classifier): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=256, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=256, bias=True)\n)\n)\n(hands_classifier): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=256, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=256, bias=True)\n)\n)\n(lower_classifier): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=256, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=256, bias=True)\n)\n)\n(motion_down_upper): Linear(in_features=768, out_features=256, bias=True)\n(motion_down_hands): 
Linear(in_features=768, out_features=256, bias=True)\n(motion_down_lower): Linear(in_features=768, out_features=256, bias=True)\n(spearker_encoder_body): Embedding(25, 768)\n(spearker_encoder_face): Embedding(25, 768)\n)\n)\n2024-04-09 08:01:55.165 | INFO | predict:__init__:624 - init MAGE_Transformer success\n2024-04-09 08:01:55.454 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv\n2024-04-09 08:01:55.461 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv\n2024-04-09 08:01:55.468 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv\n2024-04-09 08:01:55.484 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv\n2024-04-09 08:01:55.495 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv\n2024-04-09 08:01:55.826 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for MAGE_Transformer\ngenerate_silent_videos concurrentNum=1 time=1712649716.3468359\nsubprocess_index=0 begin_ts=1712649717.0764742\nprocessed 0 frames\nprocessed 100 frames\nsubprocess_index=0 render=10.95 all=11.61 begin_ts=1712649717.08 render_end_ts=1712649728.02 write_end_ts=1712649728.69\nffmpeg version 5.1.4-0+deb12u1 Copyright (c) 2000-2023 the FFmpeg developers\nbuilt with gcc 12 (Debian 12.2.0-14)\nconfiguration: --prefix=/usr --extra-version=0+deb12u1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libjack 
--enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librist --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --disable-sndio --enable-libjxl --enable-pocketsphinx --enable-librsvg --enable-libmfx --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-libplacebo --enable-librav1e --enable-shared\nlibavutil 57. 28.100 / 57. 28.100\nlibavcodec 59. 37.100 / 59. 37.100\nlibavformat 59. 27.100 / 59. 27.100\nlibavdevice 59. 7.100 / 59. 7.100\nlibavfilter 8. 44.100 / 8. 44.100\nlibswscale 6. 7.100 / 6. 7.100\nlibswresample 4. 7.100 / 4. 7.100\nlibpostproc 56. 6.100 / 56. 6.100\nInput #0, image2, from './outputs/audio2pose/custom/hf//999/frame_%d.bmp':\nDuration: 00:00:06.00, start: 0.000000, bitrate: N/A\nStream #0:0: Video: bmp, bgr24, 1920x720, 30 fps, 30 tbr, 30 tbn\nStream mapping:\nStream #0:0 -> #0:0 (bmp (native) -> h264 (libx264))\nPress [q] to stop, [?] 
for help\n[libx264 @ 0x55eaff5e2540] using cpu capabilities: MMX2 SSE2Fast SSSE3 SSE4.2 AVX FMA3 BMI2 AVX2\n[libx264 @ 0x55eaff5e2540] profile High, level 4.0, 4:2:0, 8-bit\n[libx264 @ 0x55eaff5e2540] 264 - core 164 r3095 baee400 - H.264/MPEG-4 AVC codec - Copyleft 2003-2022 - http://www.videolan.org/x264.html - options: cabac=1 ref=3 deblock=1:0:0 analyse=0x3:0x113 me=hex subme=7 psy=1 psy_rd=1.00:0.00 mixed_ref=1 me_range=16 chroma_me=1 trellis=1 8x8dct=1 cqm=0 deadzone=21,11 fast_pskip=1 chroma_qp_offset=-2 threads=6 lookahead_threads=1 sliced_threads=0 nr=0 decimate=1 interlaced=0 bluray_compat=0 constrained_intra=0 bframes=3 b_pyramid=2 b_adapt=1 b_bias=0 direct=1 weightb=1 open_gop=0 weightp=2 keyint=250 keyint_min=25 scenecut=40 intra_refresh=0 rc_lookahead=40 rc=crf mbtree=1 crf=23.0 qcomp=0.60 qpmin=0 qpmax=69 qpstep=4 ip_ratio=1.40 aq=1:1.00\nOutput #0, mp4, to './outputs/audio2pose/custom/hf//999/silence_video.mp4':\nMetadata:\nencoder : Lavf59.27.100\nStream #0:0: Video: h264 (avc1 / 0x31637661), yuv420p(tv, progressive), 1920x720, q=2-31, 30 fps, 15360 tbn\nMetadata:\nencoder : Lavc59.37.100 libx264\nSide data:\ncpb: bitrate max/min/avg: 0/0/0 buffer size: 0 vbv_delay: N/A\nframe= 1 fps=0.0 q=0.0 size= 0kB time=00:00:00.00 bitrate=N/A speed=N/A\nframe= 78 fps=0.0 q=29.0 size= 0kB time=00:00:00.83 bitrate= 0.5kbits/s speed=1.66x\nframe= 142 fps=140 q=29.0 size= 256kB time=00:00:02.96 bitrate= 707.0kbits/s speed=2.93x\nframe= 180 fps=115 q=-1.0 Lsize= 544kB time=00:00:05.90 bitrate= 755.9kbits/s speed=3.78x\nvideo:541kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.543236%\n[libx264 @ 0x55eaff5e2540] frame I:1 Avg QP:23.34 size: 19447\n[libx264 @ 0x55eaff5e2540] frame P:45 Avg QP:24.95 size: 6561\n[libx264 @ 0x55eaff5e2540] frame B:134 Avg QP:29.79 size: 1784\n[libx264 @ 0x55eaff5e2540] consecutive B-frames: 0.6% 0.0% 1.7% 97.8%\n[libx264 @ 0x55eaff5e2540] mb I I16..4: 10.2% 81.3% 8.5%\n[libx264 @ 0x55eaff5e2540] mb P 
I16..4: 0.4% 1.3% 0.5% P16..4: 10.8% 3.7% 1.7% 0.0% 0.0% skip:81.5%\n[libx264 @ 0x55eaff5e2540] mb B I16..4: 0.0% 0.1% 0.0% B16..8: 7.2% 1.6% 0.3% direct: 0.2% skip:90.4% L0:44.8% L1:45.8% BI: 9.3%\n[libx264 @ 0x55eaff5e2540] 8x8 transform intra:69.7% inter:44.7%\n[libx264 @ 0x55eaff5e2540] coded y,uvDC,uvAC intra: 25.7% 0.0% 0.0% inter: 1.6% 0.0% 0.0%\n[libx264 @ 0x55eaff5e2540] i16 v,h,dc,p: 25% 23% 11% 41%\n[libx264 @ 0x55eaff5e2540] i8 v,h,dc,ddl,ddr,vr,hd,vl,hu: 45% 8% 34% 2% 2% 2% 3% 3% 2%\n[libx264 @ 0x55eaff5e2540] i4 v,h,dc,ddl,ddr,vr,hd,vl,hu: 35% 15% 17% 5% 5% 6% 5% 8% 4%\n[libx264 @ 0x55eaff5e2540] i8c dc,h,v,p: 100% 0% 0% 0%\n[libx264 @ 0x55eaff5e2540] Weighted P-Frames: Y:0.0% UV:0.0%\n[libx264 @ 0x55eaff5e2540] ref P L0: 56.0% 8.9% 22.0% 13.1%\n[libx264 @ 0x55eaff5e2540] ref B L0: 82.5% 13.2% 4.3%\n[libx264 @ 0x55eaff5e2540] ref B L1: 95.0% 5.0%\n[libx264 @ 0x55eaff5e2540] kb/s:738.35\nVideo conversion successful. Output file: ./outputs/audio2pose/custom/hf//999/silence_video.mp4\nffmpeg version 5.1.4-0+deb12u1 Copyright (c) 2000-2023 the FFmpeg developers\nbuilt with gcc 12 (Debian 12.2.0-14)\nconfiguration: --prefix=/usr --extra-version=0+deb12u1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librist --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtheora --enable-libtwolame 
--enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --disable-sndio --enable-libjxl --enable-pocketsphinx --enable-librsvg --enable-libmfx --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-libplacebo --enable-librav1e --enable-shared\nlibavutil 57. 28.100 / 57. 28.100\nlibavcodec 59. 37.100 / 59. 37.100\nlibavformat 59. 27.100 / 59. 27.100\nlibavdevice 59. 7.100 / 59. 7.100\nlibavfilter 8. 44.100 / 8. 44.100\nlibswscale 6. 7.100 / 6. 7.100\nlibswresample 4. 7.100 / 4. 7.100\nlibpostproc 56. 6.100 / 56. 6.100\nInput #0, mov,mp4,m4a,3gp,3g2,mj2, from './outputs/audio2pose/custom/hf//999/silence_video.mp4':\nMetadata:\nmajor_brand : isom\nminor_version : 512\ncompatible_brands: isomiso2avc1mp41\nencoder : Lavf59.27.100\nDuration: 00:00:06.00, start: 0.000000, bitrate: 743 kb/s\nStream #0:0[0x1](und): Video: h264 (High) (avc1 / 0x31637661), yuv420p(progressive), 1920x720, 739 kb/s, 30 fps, 30 tbr, 15360 tbn (default)\nMetadata:\nhandler_name : VideoHandler\nvendor_id : [0][0][0][0]\nencoder : Lavc59.37.100 libx264\nGuessed Channel Layout for Input Stream #1.0 : stereo\nInput #1, wav, from '/tmp/tmpvn62w1e0ash.wav':\nMetadata:\nencoder : Lavf58.76.100\nDuration: 00:00:07.67, bitrate: 1536 kb/s\nStream #1:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 48000 Hz, stereo, s16, 1536 kb/s\nStream mapping:\nStream #0:0 -> #0:0 (copy)\nStream #1:0 -> #0:1 (pcm_s16le (native) -> aac (native))\nPress [q] to stop, [?] 
for help\nOutput #0, mp4, to './outputs/audio2pose/custom/hf//999/res_2_scott_0_3_3.mp4':\nMetadata:\nmajor_brand : isom\nminor_version : 512\ncompatible_brands: isomiso2avc1mp41\nencoder : Lavf59.27.100\nStream #0:0(und): Video: h264 (High) (avc1 / 0x31637661), yuv420p(progressive), 1920x720, q=2-31, 739 kb/s, 30 fps, 30 tbr, 15360 tbn (default)\nMetadata:\nhandler_name : VideoHandler\nvendor_id : [0][0][0][0]\nencoder : Lavc59.37.100 libx264\nStream #0:1: Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, stereo, fltp, 128 kb/s\nMetadata:\nencoder : Lavc59.37.100 aac\nframe= 0 fps=0.0 q=-1.0 size= 0kB time=00:00:00.00 bitrate=N/A speed= 0x\nframe= 180 fps=0.0 q=-1.0 Lsize= 643kB time=00:00:06.01 bitrate= 876.2kbits/s speed=36.6x\nvideo:541kB audio:94kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 1.230881%\n[aac @ 0x564cefe44180] Qavg: 1877.850\nVideo with audio generated successfully: ./outputs/audio2pose/custom/hf//999/res_2_scott_0_3_3.mp4", "metrics": { "predict_time": 21.495684, "total_time": 21.534308 }, "output": [ "https://replicate.delivery/pbxt/MygmTljOo6LoOJ64b8YqJHMJZZQsbjIWA3cix5xVWjmAoKqE/res_2_scott_0_3_3.mp4", "https://replicate.delivery/pbxt/oFll4sh07eQ4Aaucuwojs4YVMl3er054mP74AklGjmYEgqoSA/res_2_scott_0_3_3.npz", "https://replicate.delivery/pbxt/htmA0zeG0kRlMSTwlPgO21vhye41FRzReH3ZnEdLHDTJAVRlA/gt_2_scott_0_3_3.npz" ], "started_at": "2024-04-09T08:01:51.110624Z", "status": "succeeded", "urls": { "get": "https://api.replicate.com/v1/predictions/rr8ja8ndw1rgg0cer7gahachn4", "cancel": "https://api.replicate.com/v1/predictions/rr8ja8ndw1rgg0cer7gahachn4/cancel" }, "version": "80048634fe2d4edf63687229c69d7b868204a569e91df880ba418eb8a7485e73" }
Generated in2024-04-09 08:01:51.795 | INFO | utils.other_tools_hf:print_exp_info:877 - {'a_encoder': None, 'a_fix_pre': False, 'a_pre_encoder': None, 'acc_weight': 0.0, 'additional_data': False, 'adv_weight': 20.0, 'ali_weight': 0.0, 'amsgrad': False, 'apex': False, 'asmr': 0.0, 'atcont': 0.0, 'atmr': 0.0, 'aud_prob': 1.0, 'audio_dims': 1, 'audio_f': 256, 'audio_fps': 16000, 'audio_norm': False, 'audio_rep': 'wave16k', 'audio_sr': 16000, 'batch_size': 64, 'beat_align': True, 'benchmark': True, 'cache_only': False, 'cache_path': './datasets/beat_cache/beat_smplx_en_emage_test/', 'cf': 0.0, 'ch': 1.0, 'cl': 1.0, 'clean_final_seconds': 0, 'clean_first_seconds': 0, 'config': './configs/emage_test_hf.yaml', 'csv_name': 'a2g_0', 'cu': 1.0, 'cudnn_enabled': True, 'd_lr_weight': 0.2, 'd_name': None, 'data_path': './EMAGE/test_sequences/', 'data_path_1': './EMAGE/', 'dataset': 'beat_testonly_hf', 'ddp': False, 'debug': False, 'decay_epochs': 9999, 'decay_rate': 0.1, 'decode_fusion': None, 'deterministic': True, 'disable_filtering': False, 'div_reg_weight': 0.0, 'dropout_prob': 0.3, 'e_name': 'VAESKConv', 'e_path': 'weights/AESKConv_240_100.bin', 'emo_rep': None, 'emotion_dims': 8, 'emotion_f': 0, 'epoch_stage': 0, 'epochs': 400, 'eval_model': 'motion_representation', 'f_encoder': 'null', 'f_fix_pre': False, 'f_pre_encoder': 'null', 'fac_prob': 1.0, 'facial_dims': 100, 'facial_f': 0, 'facial_fps': 15, 'facial_norm': False, 'facial_rep': 'smplxflame_30', 'fid_weight': 0.0, 'finger_net': 'original', 'freeze_wordembed': True, 'fsmr': 0.0, 'ftmr': 0.0, 'fusion_mode': 'sum', 'g_name': 'MAGE_Transformer', 'gap_weight': 0.0, 'gpus': [0], 'grad_norm': 0.99, 'hidden_size': 768, 'id_rep': 'onehot', 'input_context': 'both', 'is_train': True, 'ita_weight': 0.0, 'iwa_weight': 0.0, 'kld_aud_weight': 0.0, 'kld_fac_weight': 0.0, 'kld_weight': 0.0, 'l': 4, 'lf': 3.0, 'lh': 3.0, 'll': 3.0, 'loader_workers': 0, 'log_period': 10, 'loss_contrastive_neg_weight': 0.005, 
'loss_contrastive_pos_weight': 0.2, 'loss_gan_weight': 5.0, 'loss_kld_weight': 0.1, 'loss_physical_weight': 0.0, 'loss_reg_weight': 0.05, 'loss_regression_weight': 70.0, 'lr_base': 0.0005, 'lr_min': 1e-07, 'lr_policy': 'step', 'lu': 3.0, 'm_decoder': None, 'm_encoder': 'null', 'm_fix_pre': False, 'm_pre_encoder': 'null', 'mean_pose_path': '/datasets/trinity/train/', 'model': 'emage_audio', 'momentum': 0.8, 'motion_f': 256, 'msmr': 0.0, 'mtmr': 0.0, 'multi_length_training': [1.0], 'n_layer': 1, 'n_poses': 34, 'n_pre_poses': 4, 'name': '0409_080151_emage_test_hf', 'nesterov': True, 'new_cache': True, 'no_adv_epoch': 999, 'notes': '', 'opt': 'adam', 'opt_betas': [0.5, 0.999], 'ori_joints': 'beat_smplx_joints', 'out_path': './outputs/audio2pose/', 'pos_encoding_type': 'sin', 'pos_prob': 1.0, 'pose_dims': 330, 'pose_fps': 30, 'pose_length': 64, 'pose_norm': False, 'pose_rep': 'smplxflame_30', 'pre_frames': 4, 'pre_type': 'zero', 'pretrain': False, 'project': 's2g', 'queue_size': 1024, 'random_seed': 2021, 'rec_aud_weight': 0.0, 'rec_fac_weight': 0.0, 'rec_pos_weight': 0.0, 'rec_txt_weight': 0.0, 'rec_ver_weight': 0.0, 'rec_weight': 1.0, 'render_concurrent_num': 1, 'render_tmp_img_filetype': 'bmp', 'render_video_fps': 30, 'render_video_height': 720, 'render_video_width': 1920, 'root_path': './', 'rot6d': True, 'sem_rep': None, 'sparse': 1, 'speaker_dims': 4, 'speaker_f': 0, 'speaker_id': 'onehot', 'stat': 'ts', 'std_pose_path': '/datasets/trinity/train/', 'stride': 20, 't_encoder': None, 't_fix_pre': False, 't_pre_encoder': None, 'tar_joints': 'beat_smplx_full', 'test_ckpt': './EMAGE/emage_audio_175.bin', 'test_data_path': '/datasets/trinity/test/', 'test_length': 64, 'test_period': 20, 'train_data_path': '/datasets/trinity/train/', 'train_trans': True, 'trainer': 'emage', 'training_speakers': [2], 'tsmr': 0.0, 'ttmr': 0.0, 'txt_prob': 1.0, 'use_aug': False, 'vae_codebook_size': 256, 'vae_grow': [1, 1, 2, 1], 'vae_layer': 4, 'vae_length': 240, 'vae_quantizer_lambda': 
1.0, 'vae_test_dim': 330, 'vae_test_len': 32, 'vae_test_stride': 20, 'val_data_path': '/datasets/trinity/val/', 'variational': False, 'vel_weight': 0.0, 'warmup_epochs': 0, 'warmup_lr': 0.0005, 'wei_weight': 0.0, 'weight_decay': 0.0, 'word_cache': False, 'word_dims': 300, 'word_f': 0, 'word_index_num': 5793, 'word_rep': None, 'z_type': 'speaker'} 2024-04-09 08:01:51.795 | INFO | utils.other_tools_hf:print_exp_info:878 - # ------------ 0409_080151_emage_test_hf ----------- # 2024-04-09 08:01:51.796 | INFO | utils.other_tools_hf:print_exp_info:879 - PyTorch version: 2.2.0+cu121 2024-04-09 08:01:51.796 | INFO | utils.other_tools_hf:print_exp_info:880 - CUDA version: 12.1 2024-04-09 08:01:51.796 | INFO | utils.other_tools_hf:print_exp_info:881 - 1 GPUs 2024-04-09 08:01:51.796 | INFO | utils.other_tools_hf:print_exp_info:882 - Random Seed: 2021 /tmp/tmpvn62w1e0ash.wav 2024-04-09 08:01:51.958 | INFO | dataloaders.beat_testonly_hf:build_cache:90 - Audio bit rate: 16000 2024-04-09 08:01:51.958 | INFO | dataloaders.beat_testonly_hf:build_cache:91 - Reading data './EMAGE/test_sequences/'... 2024-04-09 08:01:51.958 | INFO | dataloaders.beat_testonly_hf:build_cache:92 - Creating the dataset cache... 
2024-04-09 08:01:51.960 | INFO | dataloaders.beat_testonly_hf:cache_generation:140 - # ---- Building cache for Pose dummy 2nd ---- # 2024-04-09 08:01:54.684 | INFO | dataloaders.beat_testonly_hf:cache_generation:214 - # ---- Building cache for Facial dummy 2nd and Pose dummy 2nd ---- # 2024-04-09 08:01:54.686 | INFO | dataloaders.beat_testonly_hf:cache_generation:260 - # ---- Building cache for Audio dummy 2nd and Pose dummy 2nd ---- # 22050 (169201,) (122777,) 1906 2024-04-09 08:01:54.689 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:517 - audio: 7s, pose: 63s, facial: 63s 2024-04-09 08:01:54.689 | WARNING | dataloaders.beat_testonly_hf:_sample_from_clip:521 - reduce to 7s, ignore 56s 2024-04-09 08:01:54.689 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:544 - pose from frame 0 to 210, length 210 2024-04-09 08:01:54.689 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:545 - 1 clips is expected with stride 210 2024-04-09 08:01:54.689 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:549 - audio from frame 0 to 112000, length 112000 2024-04-09 08:01:54.694 | INFO | dataloaders.beat_testonly_hf:cache_generation:478 - no. of samples: 1 2024-04-09 08:01:54.694 | INFO | dataloaders.beat_testonly_hf:cache_generation:483 - no. 
of excluded samples: 0 (0.0%) 2024-04-09 08:01:54.699 | INFO | predict:__init__:610 - Init test dataloader success 2024-04-09 08:01:55.163 | INFO | predict:__init__:623 - DataParallel( (module): MAGE_Transformer( (audio_pre_encoder_face): WavEncoder( (feat_extractor): Sequential( (0): BasicBlock( (conv1): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,)) (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,)) (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): BasicBlock( (conv1): Conv1d(64, 64, kernel_size=(15,), stride=(6,)) (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(64, 64, kernel_size=(15,), stride=(6,)) (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (2): BasicBlock( (conv1): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) ) (3): BasicBlock( (conv1): Conv1d(64, 128, kernel_size=(15,), 
stride=(6,)) (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(64, 128, kernel_size=(15,), stride=(6,)) (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (4): BasicBlock( (conv1): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,)) (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) ) (5): BasicBlock( (conv1): Conv1d(128, 256, kernel_size=(15,), stride=(3,)) (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(256, 256, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(128, 256, kernel_size=(15,), stride=(3,)) (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (audio_pre_encoder_body): WavEncoder( (feat_extractor): Sequential( (0): BasicBlock( (conv1): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,)) (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, 
affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,)) (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): BasicBlock( (conv1): Conv1d(64, 64, kernel_size=(15,), stride=(6,)) (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(64, 64, kernel_size=(15,), stride=(6,)) (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (2): BasicBlock( (conv1): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) ) (3): BasicBlock( (conv1): Conv1d(64, 128, kernel_size=(15,), stride=(6,)) (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(64, 128, kernel_size=(15,), stride=(6,)) (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (4): BasicBlock( (conv1): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,)) (bn1): 
BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) ) (5): BasicBlock( (conv1): Conv1d(128, 256, kernel_size=(15,), stride=(3,)) (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(256, 256, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(128, 256, kernel_size=(15,), stride=(3,)) (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (motion_encoder): VQEncoderV6( (main): Sequential( (0): Conv1d(337, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): ResBlock( (model): Sequential( (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) ) ) (3): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (4): LeakyReLU(negative_slope=0.2, inplace=True) (5): ResBlock( (model): Sequential( (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) ) ) (6): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (7): LeakyReLU(negative_slope=0.2, inplace=True) (8): ResBlock( (model): Sequential( (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Conv1d(256, 256, kernel_size=(3,), 
stride=(1,), padding=(1,)) ) ) ) ) (feature2face): Linear(in_features=512, out_features=768, bias=True) (face2latent): Linear(in_features=768, out_features=256, bias=True) (transformer_de_layer): TransformerDecoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (multihead_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) (dropout3): Dropout(p=0.1, inplace=False) ) (face_decoder): TransformerDecoder( (layers): ModuleList( (0-3): 4 x TransformerDecoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (multihead_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) (dropout3): Dropout(p=0.1, inplace=False) ) ) ) (position_embeddings): PeriodicPositionalEncoding( (dropout): Dropout(p=0.1, inplace=False) ) (transformer_en_layer): TransformerEncoderLayer( (self_attn): MultiheadAttention( (out_proj): 
NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) ) (motion_self_encoder): TransformerEncoder( (layers): ModuleList( (0): TransformerEncoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) ) ) ) (audio_feature2motion): Linear(in_features=256, out_features=768, bias=True) (feature2motion): Linear(in_features=256, out_features=768, bias=True) (bodyhints_face): MLP( (mlp): Sequential( (0): Linear(in_features=256, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=256, bias=True) ) ) (bodyhints_body): MLP( (mlp): Sequential( (0): Linear(in_features=256, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=256, bias=True) ) ) (motion2latent_upper): MLP( (mlp): Sequential( (0): Linear(in_features=768, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=768, bias=True) ) ) (motion2latent_hands): MLP( (mlp): Sequential( (0): Linear(in_features=768, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, 
inplace=True) (2): Linear(in_features=768, out_features=768, bias=True) ) ) (motion2latent_lower): MLP( (mlp): Sequential( (0): Linear(in_features=768, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=768, bias=True) ) ) (wordhints_decoder): TransformerDecoder( (layers): ModuleList( (0-7): 8 x TransformerDecoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (multihead_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) (dropout3): Dropout(p=0.1, inplace=False) ) ) ) (upper_decoder): TransformerDecoder( (layers): ModuleList( (0): TransformerDecoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (multihead_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) (dropout3): Dropout(p=0.1, inplace=False) ) ) ) (hands_decoder): 
TransformerDecoder( (layers): ModuleList( (0): TransformerDecoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (multihead_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) (dropout3): Dropout(p=0.1, inplace=False) ) ) ) (lower_decoder): TransformerDecoder( (layers): ModuleList( (0): TransformerDecoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (multihead_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) (dropout3): Dropout(p=0.1, inplace=False) ) ) ) (face_classifier): MLP( (mlp): Sequential( (0): Linear(in_features=256, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=256, bias=True) ) ) (upper_classifier): MLP( (mlp): Sequential( (0): Linear(in_features=256, out_features=768, bias=True) (1): 
LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=256, bias=True) ) ) (hands_classifier): MLP( (mlp): Sequential( (0): Linear(in_features=256, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=256, bias=True) ) ) (lower_classifier): MLP( (mlp): Sequential( (0): Linear(in_features=256, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=256, bias=True) ) ) (motion_down_upper): Linear(in_features=768, out_features=256, bias=True) (motion_down_hands): Linear(in_features=768, out_features=256, bias=True) (motion_down_lower): Linear(in_features=768, out_features=256, bias=True) (spearker_encoder_body): Embedding(25, 768) (spearker_encoder_face): Embedding(25, 768) ) ) 2024-04-09 08:01:55.165 | INFO | predict:__init__:624 - init MAGE_Transformer success 2024-04-09 08:01:55.454 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv 2024-04-09 08:01:55.461 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv 2024-04-09 08:01:55.468 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv 2024-04-09 08:01:55.484 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv 2024-04-09 08:01:55.495 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv 2024-04-09 08:01:55.826 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for MAGE_Transformer generate_silent_videos concurrentNum=1 time=1712649716.3468359 subprocess_index=0 begin_ts=1712649717.0764742 processed 0 frames processed 100 frames subprocess_index=0 render=10.95 all=11.61 begin_ts=1712649717.08 render_end_ts=1712649728.02 write_end_ts=1712649728.69 ffmpeg version 5.1.4-0+deb12u1 Copyright 
(c) 2000-2023 the FFmpeg developers built with gcc 12 (Debian 12.2.0-14) configuration: --prefix=/usr --extra-version=0+deb12u1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librist --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --disable-sndio --enable-libjxl --enable-pocketsphinx --enable-librsvg --enable-libmfx --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-libplacebo --enable-librav1e --enable-shared libavutil 57. 28.100 / 57. 28.100 libavcodec 59. 37.100 / 59. 37.100 libavformat 59. 27.100 / 59. 27.100 libavdevice 59. 7.100 / 59. 7.100 libavfilter 8. 44.100 / 8. 44.100 libswscale 6. 7.100 / 6. 7.100 libswresample 4. 7.100 / 4. 7.100 libpostproc 56. 6.100 / 56. 6.100 Input #0, image2, from './outputs/audio2pose/custom/hf//999/frame_%d.bmp': Duration: 00:00:06.00, start: 0.000000, bitrate: N/A Stream #0:0: Video: bmp, bgr24, 1920x720, 30 fps, 30 tbr, 30 tbn Stream mapping: Stream #0:0 -> #0:0 (bmp (native) -> h264 (libx264)) Press [q] to stop, [?] 
for help [libx264 @ 0x55eaff5e2540] using cpu capabilities: MMX2 SSE2Fast SSSE3 SSE4.2 AVX FMA3 BMI2 AVX2 [libx264 @ 0x55eaff5e2540] profile High, level 4.0, 4:2:0, 8-bit [libx264 @ 0x55eaff5e2540] 264 - core 164 r3095 baee400 - H.264/MPEG-4 AVC codec - Copyleft 2003-2022 - http://www.videolan.org/x264.html - options: cabac=1 ref=3 deblock=1:0:0 analyse=0x3:0x113 me=hex subme=7 psy=1 psy_rd=1.00:0.00 mixed_ref=1 me_range=16 chroma_me=1 trellis=1 8x8dct=1 cqm=0 deadzone=21,11 fast_pskip=1 chroma_qp_offset=-2 threads=6 lookahead_threads=1 sliced_threads=0 nr=0 decimate=1 interlaced=0 bluray_compat=0 constrained_intra=0 bframes=3 b_pyramid=2 b_adapt=1 b_bias=0 direct=1 weightb=1 open_gop=0 weightp=2 keyint=250 keyint_min=25 scenecut=40 intra_refresh=0 rc_lookahead=40 rc=crf mbtree=1 crf=23.0 qcomp=0.60 qpmin=0 qpmax=69 qpstep=4 ip_ratio=1.40 aq=1:1.00 Output #0, mp4, to './outputs/audio2pose/custom/hf//999/silence_video.mp4': Metadata: encoder : Lavf59.27.100 Stream #0:0: Video: h264 (avc1 / 0x31637661), yuv420p(tv, progressive), 1920x720, q=2-31, 30 fps, 15360 tbn Metadata: encoder : Lavc59.37.100 libx264 Side data: cpb: bitrate max/min/avg: 0/0/0 buffer size: 0 vbv_delay: N/A frame= 1 fps=0.0 q=0.0 size= 0kB time=00:00:00.00 bitrate=N/A speed=N/A frame= 78 fps=0.0 q=29.0 size= 0kB time=00:00:00.83 bitrate= 0.5kbits/s speed=1.66x frame= 142 fps=140 q=29.0 size= 256kB time=00:00:02.96 bitrate= 707.0kbits/s speed=2.93x frame= 180 fps=115 q=-1.0 Lsize= 544kB time=00:00:05.90 bitrate= 755.9kbits/s speed=3.78x video:541kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.543236% [libx264 @ 0x55eaff5e2540] frame I:1 Avg QP:23.34 size: 19447 [libx264 @ 0x55eaff5e2540] frame P:45 Avg QP:24.95 size: 6561 [libx264 @ 0x55eaff5e2540] frame B:134 Avg QP:29.79 size: 1784 [libx264 @ 0x55eaff5e2540] consecutive B-frames: 0.6% 0.0% 1.7% 97.8% [libx264 @ 0x55eaff5e2540] mb I I16..4: 10.2% 81.3% 8.5% [libx264 @ 0x55eaff5e2540] mb P I16..4: 0.4% 1.3% 0.5% 
P16..4: 10.8% 3.7% 1.7% 0.0% 0.0% skip:81.5% [libx264 @ 0x55eaff5e2540] mb B I16..4: 0.0% 0.1% 0.0% B16..8: 7.2% 1.6% 0.3% direct: 0.2% skip:90.4% L0:44.8% L1:45.8% BI: 9.3% [libx264 @ 0x55eaff5e2540] 8x8 transform intra:69.7% inter:44.7% [libx264 @ 0x55eaff5e2540] coded y,uvDC,uvAC intra: 25.7% 0.0% 0.0% inter: 1.6% 0.0% 0.0% [libx264 @ 0x55eaff5e2540] i16 v,h,dc,p: 25% 23% 11% 41% [libx264 @ 0x55eaff5e2540] i8 v,h,dc,ddl,ddr,vr,hd,vl,hu: 45% 8% 34% 2% 2% 2% 3% 3% 2% [libx264 @ 0x55eaff5e2540] i4 v,h,dc,ddl,ddr,vr,hd,vl,hu: 35% 15% 17% 5% 5% 6% 5% 8% 4% [libx264 @ 0x55eaff5e2540] i8c dc,h,v,p: 100% 0% 0% 0% [libx264 @ 0x55eaff5e2540] Weighted P-Frames: Y:0.0% UV:0.0% [libx264 @ 0x55eaff5e2540] ref P L0: 56.0% 8.9% 22.0% 13.1% [libx264 @ 0x55eaff5e2540] ref B L0: 82.5% 13.2% 4.3% [libx264 @ 0x55eaff5e2540] ref B L1: 95.0% 5.0% [libx264 @ 0x55eaff5e2540] kb/s:738.35 Video conversion successful. Output file: ./outputs/audio2pose/custom/hf//999/silence_video.mp4 ffmpeg version 5.1.4-0+deb12u1 Copyright (c) 2000-2023 the FFmpeg developers built with gcc 12 (Debian 12.2.0-14) configuration: --prefix=/usr --extra-version=0+deb12u1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librist --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx 
--enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --disable-sndio --enable-libjxl --enable-pocketsphinx --enable-librsvg --enable-libmfx --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-libplacebo --enable-librav1e --enable-shared libavutil 57. 28.100 / 57. 28.100 libavcodec 59. 37.100 / 59. 37.100 libavformat 59. 27.100 / 59. 27.100 libavdevice 59. 7.100 / 59. 7.100 libavfilter 8. 44.100 / 8. 44.100 libswscale 6. 7.100 / 6. 7.100 libswresample 4. 7.100 / 4. 7.100 libpostproc 56. 6.100 / 56. 6.100 Input #0, mov,mp4,m4a,3gp,3g2,mj2, from './outputs/audio2pose/custom/hf//999/silence_video.mp4': Metadata: major_brand : isom minor_version : 512 compatible_brands: isomiso2avc1mp41 encoder : Lavf59.27.100 Duration: 00:00:06.00, start: 0.000000, bitrate: 743 kb/s Stream #0:0[0x1](und): Video: h264 (High) (avc1 / 0x31637661), yuv420p(progressive), 1920x720, 739 kb/s, 30 fps, 30 tbr, 15360 tbn (default) Metadata: handler_name : VideoHandler vendor_id : [0][0][0][0] encoder : Lavc59.37.100 libx264 Guessed Channel Layout for Input Stream #1.0 : stereo Input #1, wav, from '/tmp/tmpvn62w1e0ash.wav': Metadata: encoder : Lavf58.76.100 Duration: 00:00:07.67, bitrate: 1536 kb/s Stream #1:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 48000 Hz, stereo, s16, 1536 kb/s Stream mapping: Stream #0:0 -> #0:0 (copy) Stream #1:0 -> #0:1 (pcm_s16le (native) -> aac (native)) Press [q] to stop, [?] 
for help Output #0, mp4, to './outputs/audio2pose/custom/hf//999/res_2_scott_0_3_3.mp4': Metadata: major_brand : isom minor_version : 512 compatible_brands: isomiso2avc1mp41 encoder : Lavf59.27.100 Stream #0:0(und): Video: h264 (High) (avc1 / 0x31637661), yuv420p(progressive), 1920x720, q=2-31, 739 kb/s, 30 fps, 30 tbr, 15360 tbn (default) Metadata: handler_name : VideoHandler vendor_id : [0][0][0][0] encoder : Lavc59.37.100 libx264 Stream #0:1: Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, stereo, fltp, 128 kb/s Metadata: encoder : Lavc59.37.100 aac frame= 0 fps=0.0 q=-1.0 size= 0kB time=00:00:00.00 bitrate=N/A speed= 0x frame= 180 fps=0.0 q=-1.0 Lsize= 643kB time=00:00:06.01 bitrate= 876.2kbits/s speed=36.6x video:541kB audio:94kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 1.230881% [aac @ 0x564cefe44180] Qavg: 1877.850 Video with audio generated successfully: ./outputs/audio2pose/custom/hf//999/res_2_scott_0_3_3.mp4
Prediction
camenduru/emage:80048634fe2d4edf63687229c69d7b868204a569e91df880ba418eb8a7485e73 · ID: 5p2kenh3axrga0ceraavh09p5c · Status: Succeeded · Source: Web · Hardware: T4 · Total duration · Created · Input
- audio_path
- Video Player is loading.Current Time 00:00:000/Duration 00:00:000Loaded: 0%Stream Type LIVERemaining Time -00:00:0001x
- Chapters
- descriptions off, selected
- captions settings, opens captions settings dialog
- captions off, selected
This is a modal window.
Beginning of dialog window. Escape will cancel and close the window.
End of dialog window.
{ "audio_path": "https://replicate.delivery/pbxt/KiSGX34szVb7honQI7PAmO8tzllLueRcr73z0RM2qaV6C2ZW/ash.wav" }
Install Replicate’s Node.js client library: npm install replicate
Import and set up the client: import Replicate from "replicate"; const replicate = new Replicate({ auth: process.env.REPLICATE_API_TOKEN, });
Run camenduru/emage using Replicate’s API. Check out the model's schema for an overview of inputs and outputs.
const output = await replicate.run( "camenduru/emage:80048634fe2d4edf63687229c69d7b868204a569e91df880ba418eb8a7485e73", { input: { audio_path: "https://replicate.delivery/pbxt/KiSGX34szVb7honQI7PAmO8tzllLueRcr73z0RM2qaV6C2ZW/ash.wav" } } ); // To access the file URL: console.log(output[0].url()); //=> "http://example.com" // To write the file to disk: fs.writeFile("my-image.png", output[0]);
To learn more, take a look at the guide on getting started with Node.js.
Install Replicate’s Python client library: pip install replicate
Import the client: import replicate
Run camenduru/emage using Replicate’s API. Check out the model's schema for an overview of inputs and outputs.
output = replicate.run( "camenduru/emage:80048634fe2d4edf63687229c69d7b868204a569e91df880ba418eb8a7485e73", input={ "audio_path": "https://replicate.delivery/pbxt/KiSGX34szVb7honQI7PAmO8tzllLueRcr73z0RM2qaV6C2ZW/ash.wav" } ) print(output)
To learn more, take a look at the guide on getting started with Python.
Run camenduru/emage using Replicate’s API. Check out the model's schema for an overview of inputs and outputs.
curl -s -X POST \ -H "Authorization: Bearer $REPLICATE_API_TOKEN" \ -H "Content-Type: application/json" \ -H "Prefer: wait" \ -d $'{ "version": "camenduru/emage:80048634fe2d4edf63687229c69d7b868204a569e91df880ba418eb8a7485e73", "input": { "audio_path": "https://replicate.delivery/pbxt/KiSGX34szVb7honQI7PAmO8tzllLueRcr73z0RM2qaV6C2ZW/ash.wav" } }' \ https://api.replicate.com/v1/predictions
To learn more, take a look at Replicate’s HTTP API reference docs.
Output
{ "completed_at": "2024-04-09T11:23:53.191866Z", "created_at": "2024-04-09T11:18:57.623000Z", "data_removed": false, "error": null, "id": "5p2kenh3axrga0ceraavh09p5c", "input": { "audio_path": "https://replicate.delivery/pbxt/KiSGX34szVb7honQI7PAmO8tzllLueRcr73z0RM2qaV6C2ZW/ash.wav" }, "logs": "2024-04-09 11:21:35.381 | INFO | utils.other_tools_hf:print_exp_info:877 - {'a_encoder': None,\n'a_fix_pre': False,\n'a_pre_encoder': None,\n'acc_weight': 0.0,\n'additional_data': False,\n'adv_weight': 20.0,\n'ali_weight': 0.0,\n'amsgrad': False,\n'apex': False,\n'asmr': 0.0,\n'atcont': 0.0,\n'atmr': 0.0,\n'aud_prob': 1.0,\n'audio_dims': 1,\n'audio_f': 256,\n'audio_fps': 16000,\n'audio_norm': False,\n'audio_rep': 'wave16k',\n'audio_sr': 16000,\n'batch_size': 64,\n'beat_align': True,\n'benchmark': True,\n'cache_only': False,\n'cache_path': './datasets/beat_cache/beat_smplx_en_emage_test/',\n'cf': 0.0,\n'ch': 1.0,\n'cl': 1.0,\n'clean_final_seconds': 0,\n'clean_first_seconds': 0,\n'config': './configs/emage_test_hf.yaml',\n'csv_name': 'a2g_0',\n'cu': 1.0,\n'cudnn_enabled': True,\n'd_lr_weight': 0.2,\n'd_name': None,\n'data_path': './EMAGE/test_sequences/',\n'data_path_1': './EMAGE/',\n'dataset': 'beat_testonly_hf',\n'ddp': False,\n'debug': False,\n'decay_epochs': 9999,\n'decay_rate': 0.1,\n'decode_fusion': None,\n'deterministic': True,\n'disable_filtering': False,\n'div_reg_weight': 0.0,\n'dropout_prob': 0.3,\n'e_name': 'VAESKConv',\n'e_path': 'weights/AESKConv_240_100.bin',\n'emo_rep': None,\n'emotion_dims': 8,\n'emotion_f': 0,\n'epoch_stage': 0,\n'epochs': 400,\n'eval_model': 'motion_representation',\n'f_encoder': 'null',\n'f_fix_pre': False,\n'f_pre_encoder': 'null',\n'fac_prob': 1.0,\n'facial_dims': 100,\n'facial_f': 0,\n'facial_fps': 15,\n'facial_norm': False,\n'facial_rep': 'smplxflame_30',\n'fid_weight': 0.0,\n'finger_net': 'original',\n'freeze_wordembed': True,\n'fsmr': 0.0,\n'ftmr': 0.0,\n'fusion_mode': 'sum',\n'g_name': 'MAGE_Transformer',\n'gap_weight': 0.0,\n'gpus': 
[0],\n'grad_norm': 0.99,\n'hidden_size': 768,\n'id_rep': 'onehot',\n'input_context': 'both',\n'is_train': True,\n'ita_weight': 0.0,\n'iwa_weight': 0.0,\n'kld_aud_weight': 0.0,\n'kld_fac_weight': 0.0,\n'kld_weight': 0.0,\n'l': 4,\n'lf': 3.0,\n'lh': 3.0,\n'll': 3.0,\n'loader_workers': 0,\n'log_period': 10,\n'loss_contrastive_neg_weight': 0.005,\n'loss_contrastive_pos_weight': 0.2,\n'loss_gan_weight': 5.0,\n'loss_kld_weight': 0.1,\n'loss_physical_weight': 0.0,\n'loss_reg_weight': 0.05,\n'loss_regression_weight': 70.0,\n'lr_base': 0.0005,\n'lr_min': 1e-07,\n'lr_policy': 'step',\n'lu': 3.0,\n'm_decoder': None,\n'm_encoder': 'null',\n'm_fix_pre': False,\n'm_pre_encoder': 'null',\n'mean_pose_path': '/datasets/trinity/train/',\n'model': 'emage_audio',\n'momentum': 0.8,\n'motion_f': 256,\n'msmr': 0.0,\n'mtmr': 0.0,\n'multi_length_training': [1.0],\n'n_layer': 1,\n'n_poses': 34,\n'n_pre_poses': 4,\n'name': '0409_112135_emage_test_hf',\n'nesterov': True,\n'new_cache': True,\n'no_adv_epoch': 999,\n'notes': '',\n'opt': 'adam',\n'opt_betas': [0.5, 0.999],\n'ori_joints': 'beat_smplx_joints',\n'out_path': './outputs/audio2pose/',\n'pos_encoding_type': 'sin',\n'pos_prob': 1.0,\n'pose_dims': 330,\n'pose_fps': 30,\n'pose_length': 64,\n'pose_norm': False,\n'pose_rep': 'smplxflame_30',\n'pre_frames': 4,\n'pre_type': 'zero',\n'pretrain': False,\n'project': 's2g',\n'queue_size': 1024,\n'random_seed': 2021,\n'rec_aud_weight': 0.0,\n'rec_fac_weight': 0.0,\n'rec_pos_weight': 0.0,\n'rec_txt_weight': 0.0,\n'rec_ver_weight': 0.0,\n'rec_weight': 1.0,\n'render_concurrent_num': 1,\n'render_tmp_img_filetype': 'bmp',\n'render_video_fps': 30,\n'render_video_height': 720,\n'render_video_width': 1920,\n'root_path': './',\n'rot6d': True,\n'sem_rep': None,\n'sparse': 1,\n'speaker_dims': 4,\n'speaker_f': 0,\n'speaker_id': 'onehot',\n'stat': 'ts',\n'std_pose_path': '/datasets/trinity/train/',\n'stride': 20,\n't_encoder': None,\n't_fix_pre': False,\n't_pre_encoder': None,\n'tar_joints': 
'beat_smplx_full',\n'test_ckpt': './EMAGE/emage_audio_175.bin',\n'test_data_path': '/datasets/trinity/test/',\n'test_length': 64,\n'test_period': 20,\n'train_data_path': '/datasets/trinity/train/',\n'train_trans': True,\n'trainer': 'emage',\n'training_speakers': [2],\n'tsmr': 0.0,\n'ttmr': 0.0,\n'txt_prob': 1.0,\n'use_aug': False,\n'vae_codebook_size': 256,\n'vae_grow': [1, 1, 2, 1],\n'vae_layer': 4,\n'vae_length': 240,\n'vae_quantizer_lambda': 1.0,\n'vae_test_dim': 330,\n'vae_test_len': 32,\n'vae_test_stride': 20,\n'val_data_path': '/datasets/trinity/val/',\n'variational': False,\n'vel_weight': 0.0,\n'warmup_epochs': 0,\n'warmup_lr': 0.0005,\n'wei_weight': 0.0,\n'weight_decay': 0.0,\n'word_cache': False,\n'word_dims': 300,\n'word_f': 0,\n'word_index_num': 5793,\n'word_rep': None,\n'z_type': 'speaker'}\n2024-04-09 11:21:35.381 | INFO | utils.other_tools_hf:print_exp_info:878 - # ------------ 0409_112135_emage_test_hf ----------- #\n2024-04-09 11:21:35.382 | INFO | utils.other_tools_hf:print_exp_info:879 - PyTorch version: 2.2.0+cu121\n2024-04-09 11:21:35.382 | INFO | utils.other_tools_hf:print_exp_info:880 - CUDA version: 12.1\n2024-04-09 11:21:35.411 | INFO | utils.other_tools_hf:print_exp_info:881 - 1 GPUs\n2024-04-09 11:21:35.411 | INFO | utils.other_tools_hf:print_exp_info:882 - Random Seed: 2021\n/tmp/tmpxrjlchkhash.wav\n2024-04-09 11:21:44.407 | INFO | dataloaders.beat_testonly_hf:build_cache:90 - Audio bit rate: 16000\n2024-04-09 11:21:44.408 | INFO | dataloaders.beat_testonly_hf:build_cache:91 - Reading data './EMAGE/test_sequences/'...\n2024-04-09 11:21:44.408 | INFO | dataloaders.beat_testonly_hf:build_cache:92 - Creating the dataset cache...\n2024-04-09 11:21:44.410 | INFO | dataloaders.beat_testonly_hf:cache_generation:140 - # ---- Building cache for Pose dummy 2nd ---- #\n2024-04-09 11:21:48.702 | INFO | dataloaders.beat_testonly_hf:cache_generation:214 - # ---- Building cache for Facial dummy 2nd and Pose dummy 2nd ---- #\n2024-04-09 11:21:48.705 | 
INFO | dataloaders.beat_testonly_hf:cache_generation:260 - # ---- Building cache for Audio dummy 2nd and Pose dummy 2nd ---- #\n22050\n(169201,)\n(122777,)\n1906\n2024-04-09 11:21:48.711 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:517 - audio: 7s, pose: 63s, facial: 63s\n2024-04-09 11:21:48.712 | WARNING | dataloaders.beat_testonly_hf:_sample_from_clip:521 - reduce to 7s, ignore 56s\n2024-04-09 11:21:48.712 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:544 - pose from frame 0 to 210, length 210\n2024-04-09 11:21:48.712 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:545 - 1 clips is expected with stride 210\n2024-04-09 11:21:48.712 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:549 - audio from frame 0 to 112000, length 112000\n2024-04-09 11:21:49.020 | INFO | dataloaders.beat_testonly_hf:cache_generation:478 - no. of samples: 1\n2024-04-09 11:21:49.020 | INFO | dataloaders.beat_testonly_hf:cache_generation:483 - no. of excluded samples: 0 (0.0%)\n2024-04-09 11:21:49.021 | INFO | predict:__init__:610 - Init test dataloader success\n2024-04-09 11:21:50.586 | INFO | predict:__init__:623 - DataParallel(\n(module): MAGE_Transformer(\n(audio_pre_encoder_face): WavEncoder(\n(feat_extractor): Sequential(\n(0): BasicBlock(\n(conv1): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,))\n(bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,))\n(1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n(1): BasicBlock(\n(conv1): Conv1d(64, 64, kernel_size=(15,), stride=(6,))\n(bn1): BatchNorm1d(64, eps=1e-05, 
momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(64, 64, kernel_size=(15,), stride=(6,))\n(1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n(2): BasicBlock(\n(conv1): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n)\n(3): BasicBlock(\n(conv1): Conv1d(64, 128, kernel_size=(15,), stride=(6,))\n(bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(64, 128, kernel_size=(15,), stride=(6,))\n(1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n(4): BasicBlock(\n(conv1): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, 
inplace=True)\n)\n(5): BasicBlock(\n(conv1): Conv1d(128, 256, kernel_size=(15,), stride=(3,))\n(bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(256, 256, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(128, 256, kernel_size=(15,), stride=(3,))\n(1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n)\n)\n(audio_pre_encoder_body): WavEncoder(\n(feat_extractor): Sequential(\n(0): BasicBlock(\n(conv1): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,))\n(bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,))\n(1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n(1): BasicBlock(\n(conv1): Conv1d(64, 64, kernel_size=(15,), stride=(6,))\n(bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(64, 64, kernel_size=(15,), stride=(6,))\n(1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n(2): BasicBlock(\n(conv1): Conv1d(64, 64, 
kernel_size=(15,), stride=(1,), padding=(7,))\n(bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n)\n(3): BasicBlock(\n(conv1): Conv1d(64, 128, kernel_size=(15,), stride=(6,))\n(bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(64, 128, kernel_size=(15,), stride=(6,))\n(1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n(4): BasicBlock(\n(conv1): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n)\n(5): BasicBlock(\n(conv1): Conv1d(128, 256, kernel_size=(15,), stride=(3,))\n(bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act1): LeakyReLU(negative_slope=0.01, inplace=True)\n(conv2): Conv1d(256, 256, kernel_size=(15,), stride=(1,), padding=(7,))\n(bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n(act2): LeakyReLU(negative_slope=0.01, inplace=True)\n(downsample): Sequential(\n(0): Conv1d(128, 256, kernel_size=(15,), stride=(3,))\n(1): 
BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n)\n)\n)\n)\n(motion_encoder): VQEncoderV6(\n(main): Sequential(\n(0): Conv1d(337, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): ResBlock(\n(model): Sequential(\n(0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n)\n)\n(3): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n(4): LeakyReLU(negative_slope=0.2, inplace=True)\n(5): ResBlock(\n(model): Sequential(\n(0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n)\n)\n(6): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n(7): LeakyReLU(negative_slope=0.2, inplace=True)\n(8): ResBlock(\n(model): Sequential(\n(0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))\n)\n)\n)\n)\n(feature2face): Linear(in_features=512, out_features=768, bias=True)\n(face2latent): Linear(in_features=768, out_features=256, bias=True)\n(transformer_de_layer): TransformerDecoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(multihead_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, bias=True)\n(dropout): Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm3): LayerNorm((768,), eps=1e-05, 
elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n(dropout3): Dropout(p=0.1, inplace=False)\n)\n(face_decoder): TransformerDecoder(\n(layers): ModuleList(\n(0-3): 4 x TransformerDecoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(multihead_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, bias=True)\n(dropout): Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n(dropout3): Dropout(p=0.1, inplace=False)\n)\n)\n)\n(position_embeddings): PeriodicPositionalEncoding(\n(dropout): Dropout(p=0.1, inplace=False)\n)\n(transformer_en_layer): TransformerEncoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, bias=True)\n(dropout): Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n)\n(motion_self_encoder): TransformerEncoder(\n(layers): ModuleList(\n(0): TransformerEncoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, bias=True)\n(dropout): 
Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n)\n)\n)\n(audio_feature2motion): Linear(in_features=256, out_features=768, bias=True)\n(feature2motion): Linear(in_features=256, out_features=768, bias=True)\n(bodyhints_face): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=256, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=256, bias=True)\n)\n)\n(bodyhints_body): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=256, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=256, bias=True)\n)\n)\n(motion2latent_upper): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=768, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=768, bias=True)\n)\n)\n(motion2latent_hands): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=768, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=768, bias=True)\n)\n)\n(motion2latent_lower): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=768, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=768, bias=True)\n)\n)\n(wordhints_decoder): TransformerDecoder(\n(layers): ModuleList(\n(0-7): 8 x TransformerDecoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(multihead_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, 
bias=True)\n(dropout): Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n(dropout3): Dropout(p=0.1, inplace=False)\n)\n)\n)\n(upper_decoder): TransformerDecoder(\n(layers): ModuleList(\n(0): TransformerDecoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(multihead_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, bias=True)\n(dropout): Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n(dropout3): Dropout(p=0.1, inplace=False)\n)\n)\n)\n(hands_decoder): TransformerDecoder(\n(layers): ModuleList(\n(0): TransformerDecoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(multihead_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, bias=True)\n(dropout): Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm3): LayerNorm((768,), eps=1e-05, 
elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n(dropout3): Dropout(p=0.1, inplace=False)\n)\n)\n)\n(lower_decoder): TransformerDecoder(\n(layers): ModuleList(\n(0): TransformerDecoderLayer(\n(self_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(multihead_attn): MultiheadAttention(\n(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)\n)\n(linear1): Linear(in_features=768, out_features=1536, bias=True)\n(dropout): Dropout(p=0.1, inplace=False)\n(linear2): Linear(in_features=1536, out_features=768, bias=True)\n(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n(dropout1): Dropout(p=0.1, inplace=False)\n(dropout2): Dropout(p=0.1, inplace=False)\n(dropout3): Dropout(p=0.1, inplace=False)\n)\n)\n)\n(face_classifier): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=256, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=256, bias=True)\n)\n)\n(upper_classifier): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=256, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=256, bias=True)\n)\n)\n(hands_classifier): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=256, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=256, bias=True)\n)\n)\n(lower_classifier): MLP(\n(mlp): Sequential(\n(0): Linear(in_features=256, out_features=768, bias=True)\n(1): LeakyReLU(negative_slope=0.2, inplace=True)\n(2): Linear(in_features=768, out_features=256, bias=True)\n)\n)\n(motion_down_upper): Linear(in_features=768, out_features=256, bias=True)\n(motion_down_hands): 
Linear(in_features=768, out_features=256, bias=True)\n(motion_down_lower): Linear(in_features=768, out_features=256, bias=True)\n(spearker_encoder_body): Embedding(25, 768)\n(spearker_encoder_face): Embedding(25, 768)\n)\n)\n2024-04-09 11:21:50.590 | INFO | predict:__init__:624 - init MAGE_Transformer success\n2024-04-09 11:21:51.714 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv\n2024-04-09 11:21:51.730 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv\n2024-04-09 11:21:51.749 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv\n2024-04-09 11:21:51.781 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv\n2024-04-09 11:21:51.813 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv\n2024-04-09 11:21:52.376 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for MAGE_Transformer\ngenerate_silent_videos concurrentNum=1 time=1712661715.849715\nsubprocess_index=0 begin_ts=1712661717.4629269\nprocessed 0 frames\nprocessed 100 frames\nsubprocess_index=0 render=107.60 all=109.60 begin_ts=1712661717.46 render_end_ts=1712661825.06 write_end_ts=1712661827.06\nffmpeg version 5.1.4-0+deb12u1 Copyright (c) 2000-2023 the FFmpeg developers\nbuilt with gcc 12 (Debian 12.2.0-14)\nconfiguration: --prefix=/usr --extra-version=0+deb12u1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libjack 
--enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librist --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --disable-sndio --enable-libjxl --enable-pocketsphinx --enable-librsvg --enable-libmfx --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-libplacebo --enable-librav1e --enable-shared\nlibavutil 57. 28.100 / 57. 28.100\nlibavcodec 59. 37.100 / 59. 37.100\nlibavformat 59. 27.100 / 59. 27.100\nlibavdevice 59. 7.100 / 59. 7.100\nlibavfilter 8. 44.100 / 8. 44.100\nlibswscale 6. 7.100 / 6. 7.100\nlibswresample 4. 7.100 / 4. 7.100\nlibpostproc 56. 6.100 / 56. 6.100\nInput #0, image2, from './outputs/audio2pose/custom/hf//999/frame_%d.bmp':\nDuration: 00:00:06.00, start: 0.000000, bitrate: N/A\nStream #0:0: Video: bmp, bgr24, 1920x720, 30 fps, 30 tbr, 30 tbn\nStream mapping:\nStream #0:0 -> #0:0 (bmp (native) -> h264 (libx264))\nPress [q] to stop, [?] 
for help\n[libx264 @ 0x5d27f62a4100] using cpu capabilities: MMX2 SSE2Fast SSSE3 SSE4.2 AVX FMA3 BMI2 AVX2\n[libx264 @ 0x5d27f62a4100] profile High, level 4.0, 4:2:0, 8-bit\n[libx264 @ 0x5d27f62a4100] 264 - core 164 r3095 baee400 - H.264/MPEG-4 AVC codec - Copyleft 2003-2022 - http://www.videolan.org/x264.html - options: cabac=1 ref=3 deblock=1:0:0 analyse=0x3:0x113 me=hex subme=7 psy=1 psy_rd=1.00:0.00 mixed_ref=1 me_range=16 chroma_me=1 trellis=1 8x8dct=1 cqm=0 deadzone=21,11 fast_pskip=1 chroma_qp_offset=-2 threads=22 lookahead_threads=3 sliced_threads=0 nr=0 decimate=1 interlaced=0 bluray_compat=0 constrained_intra=0 bframes=3 b_pyramid=2 b_adapt=1 b_bias=0 direct=1 weightb=1 open_gop=0 weightp=2 keyint=250 keyint_min=25 scenecut=40 intra_refresh=0 rc_lookahead=40 rc=crf mbtree=1 crf=23.0 qcomp=0.60 qpmin=0 qpmax=69 qpstep=4 ip_ratio=1.40 aq=1:1.00\nOutput #0, mp4, to './outputs/audio2pose/custom/hf//999/silence_video.mp4':\nMetadata:\nencoder : Lavf59.27.100\nStream #0:0: Video: h264 (avc1 / 0x31637661), yuv420p(tv, progressive), 1920x720, q=2-31, 30 fps, 15360 tbn\nMetadata:\nencoder : Lavc59.37.100 libx264\nSide data:\ncpb: bitrate max/min/avg: 0/0/0 buffer size: 0 vbv_delay: N/A\nframe= 1 fps=0.0 q=0.0 size= 0kB time=00:00:00.00 bitrate=N/A speed= 0x\nframe= 37 fps=0.0 q=0.0 size= 0kB time=00:00:00.00 bitrate=N/A speed= 0x\nframe= 64 fps= 53 q=0.0 size= 0kB time=00:00:00.00 bitrate=N/A speed= 0x\nframe= 96 fps= 54 q=29.0 size= 0kB time=00:00:00.90 bitrate= 0.4kbits/s speed=0.511x\nframe= 128 fps= 56 q=29.0 size= 0kB time=00:00:01.96 bitrate= 0.2kbits/s speed=0.867x\nframe= 160 fps= 58 q=29.0 size= 256kB time=00:00:03.03 bitrate= 691.5kbits/s speed=1.09x\nframe= 170 fps= 49 q=29.0 size= 256kB time=00:00:03.36 bitrate= 623.0kbits/s speed=0.965x\nframe= 180 fps= 44 q=-1.0 Lsize= 544kB time=00:00:05.90 bitrate= 754.7kbits/s speed=1.45x\nvideo:541kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.544116%\n[libx264 @ 0x5d27f62a4100] 
frame I:1 Avg QP:22.59 size: 19472\n[libx264 @ 0x5d27f62a4100] frame P:45 Avg QP:24.95 size: 6556\n[libx264 @ 0x5d27f62a4100] frame B:134 Avg QP:29.78 size: 1779\n[libx264 @ 0x5d27f62a4100] consecutive B-frames: 0.6% 0.0% 1.7% 97.8%\n[libx264 @ 0x5d27f62a4100] mb I I16..4: 12.2% 79.2% 8.6%\n[libx264 @ 0x5d27f62a4100] mb P I16..4: 0.4% 1.3% 0.5% P16..4: 11.0% 3.8% 1.7% 0.0% 0.0% skip:81.3%\n[libx264 @ 0x5d27f62a4100] mb B I16..4: 0.0% 0.1% 0.0% B16..8: 7.3% 1.6% 0.3% direct: 0.2% skip:90.4% L0:45.0% L1:45.5% BI: 9.5%\n[libx264 @ 0x5d27f62a4100] 8x8 transform intra:68.2% inter:45.3%\n[libx264 @ 0x5d27f62a4100] coded y,uvDC,uvAC intra: 26.1% 0.0% 0.0% inter: 1.6% 0.0% 0.0%\n[libx264 @ 0x5d27f62a4100] i16 v,h,dc,p: 30% 20% 11% 40%\n[libx264 @ 0x5d27f62a4100] i8 v,h,dc,ddl,ddr,vr,hd,vl,hu: 46% 9% 32% 2% 2% 3% 3% 3% 1%\n[libx264 @ 0x5d27f62a4100] i4 v,h,dc,ddl,ddr,vr,hd,vl,hu: 35% 15% 17% 6% 5% 6% 5% 8% 4%\n[libx264 @ 0x5d27f62a4100] i8c dc,h,v,p: 100% 0% 0% 0%\n[libx264 @ 0x5d27f62a4100] Weighted P-Frames: Y:0.0% UV:0.0%\n[libx264 @ 0x5d27f62a4100] ref P L0: 57.1% 8.6% 21.5% 12.8%\n[libx264 @ 0x5d27f62a4100] ref B L0: 82.7% 13.0% 4.2%\n[libx264 @ 0x5d27f62a4100] ref B L1: 95.2% 4.8%\n[libx264 @ 0x5d27f62a4100] kb/s:737.16\nVideo conversion successful. 
Output file: ./outputs/audio2pose/custom/hf//999/silence_video.mp4\nffmpeg version 5.1.4-0+deb12u1 Copyright (c) 2000-2023 the FFmpeg developers\nbuilt with gcc 12 (Debian 12.2.0-14)\nconfiguration: --prefix=/usr --extra-version=0+deb12u1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librist --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --disable-sndio --enable-libjxl --enable-pocketsphinx --enable-librsvg --enable-libmfx --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-libplacebo --enable-librav1e --enable-shared\nlibavutil 57. 28.100 / 57. 28.100\nlibavcodec 59. 37.100 / 59. 37.100\nlibavformat 59. 27.100 / 59. 27.100\nlibavdevice 59. 7.100 / 59. 7.100\nlibavfilter 8. 44.100 / 8. 44.100\nlibswscale 6. 7.100 / 6. 7.100\nlibswresample 4. 7.100 / 4. 7.100\nlibpostproc 56. 6.100 / 56. 
6.100\nInput #0, mov,mp4,m4a,3gp,3g2,mj2, from './outputs/audio2pose/custom/hf//999/silence_video.mp4':\nMetadata:\nmajor_brand : isom\nminor_version : 512\ncompatible_brands: isomiso2avc1mp41\nencoder : Lavf59.27.100\nDuration: 00:00:06.00, start: 0.000000, bitrate: 742 kb/s\nStream #0:0[0x1](und): Video: h264 (High) (avc1 / 0x31637661), yuv420p(progressive), 1920x720, 738 kb/s, 30 fps, 30 tbr, 15360 tbn (default)\nMetadata:\nhandler_name : VideoHandler\nvendor_id : [0][0][0][0]\nencoder : Lavc59.37.100 libx264\nGuessed Channel Layout for Input Stream #1.0 : stereo\nInput #1, wav, from '/tmp/tmpxrjlchkhash.wav':\nMetadata:\nencoder : Lavf58.76.100\nDuration: 00:00:07.67, bitrate: 1536 kb/s\nStream #1:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 48000 Hz, stereo, s16, 1536 kb/s\nStream mapping:\nStream #0:0 -> #0:0 (copy)\nStream #1:0 -> #0:1 (pcm_s16le (native) -> aac (native))\nPress [q] to stop, [?] for help\nOutput #0, mp4, to './outputs/audio2pose/custom/hf//999/res_2_scott_0_3_3.mp4':\nMetadata:\nmajor_brand : isom\nminor_version : 512\ncompatible_brands: isomiso2avc1mp41\nencoder : Lavf59.27.100\nStream #0:0(und): Video: h264 (High) (avc1 / 0x31637661), yuv420p(progressive), 1920x720, q=2-31, 738 kb/s, 30 fps, 30 tbr, 15360 tbn (default)\nMetadata:\nhandler_name : VideoHandler\nvendor_id : [0][0][0][0]\nencoder : Lavc59.37.100 libx264\nStream #0:1: Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, stereo, fltp, 128 kb/s\nMetadata:\nencoder : Lavc59.37.100 aac\nframe= 0 fps=0.0 q=-1.0 size= 0kB time=00:00:00.00 bitrate=N/A speed= 0x\nframe= 171 fps=0.0 q=-1.0 size= 512kB time=00:00:05.63 bitrate= 744.8kbits/s speed=10.8x\nframe= 180 fps=0.0 q=-1.0 Lsize= 643kB time=00:00:06.01 bitrate= 875.0kbits/s speed= 11x\nvideo:541kB audio:94kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 1.232579%\n[aac @ 0x5bacce29f280] Qavg: 1877.850\nVideo with audio generated successfully: ./outputs/audio2pose/custom/hf//999/res_2_scott_0_3_3.mp4", "metrics": { 
"predict_time": 137.97153, "total_time": 295.568866 }, "output": [ "https://replicate.delivery/pbxt/HQW5AiNniWJSCZpWvkaD3hrSD77zT4KAnypDJfr6h1GkuWUJA/res_2_scott_0_3_3.mp4", "https://replicate.delivery/pbxt/sWV99f9cyDSffoBd9Fiqfd2sYe4GrkKeqaJd7BhBKeVIkuWUJA/res_2_scott_0_3_3.npz", "https://replicate.delivery/pbxt/9P6J4MamWNKhDRbiAO1DetUJY5Wmz6uaoiK2ndYUSQxkuWUJA/gt_2_scott_0_3_3.npz" ], "started_at": "2024-04-09T11:21:35.220336Z", "status": "succeeded", "urls": { "get": "https://api.replicate.com/v1/predictions/5p2kenh3axrga0ceraavh09p5c", "cancel": "https://api.replicate.com/v1/predictions/5p2kenh3axrga0ceraavh09p5c/cancel" }, "version": "80048634fe2d4edf63687229c69d7b868204a569e91df880ba418eb8a7485e73" }
Generated in2024-04-09 11:21:35.381 | INFO | utils.other_tools_hf:print_exp_info:877 - {'a_encoder': None, 'a_fix_pre': False, 'a_pre_encoder': None, 'acc_weight': 0.0, 'additional_data': False, 'adv_weight': 20.0, 'ali_weight': 0.0, 'amsgrad': False, 'apex': False, 'asmr': 0.0, 'atcont': 0.0, 'atmr': 0.0, 'aud_prob': 1.0, 'audio_dims': 1, 'audio_f': 256, 'audio_fps': 16000, 'audio_norm': False, 'audio_rep': 'wave16k', 'audio_sr': 16000, 'batch_size': 64, 'beat_align': True, 'benchmark': True, 'cache_only': False, 'cache_path': './datasets/beat_cache/beat_smplx_en_emage_test/', 'cf': 0.0, 'ch': 1.0, 'cl': 1.0, 'clean_final_seconds': 0, 'clean_first_seconds': 0, 'config': './configs/emage_test_hf.yaml', 'csv_name': 'a2g_0', 'cu': 1.0, 'cudnn_enabled': True, 'd_lr_weight': 0.2, 'd_name': None, 'data_path': './EMAGE/test_sequences/', 'data_path_1': './EMAGE/', 'dataset': 'beat_testonly_hf', 'ddp': False, 'debug': False, 'decay_epochs': 9999, 'decay_rate': 0.1, 'decode_fusion': None, 'deterministic': True, 'disable_filtering': False, 'div_reg_weight': 0.0, 'dropout_prob': 0.3, 'e_name': 'VAESKConv', 'e_path': 'weights/AESKConv_240_100.bin', 'emo_rep': None, 'emotion_dims': 8, 'emotion_f': 0, 'epoch_stage': 0, 'epochs': 400, 'eval_model': 'motion_representation', 'f_encoder': 'null', 'f_fix_pre': False, 'f_pre_encoder': 'null', 'fac_prob': 1.0, 'facial_dims': 100, 'facial_f': 0, 'facial_fps': 15, 'facial_norm': False, 'facial_rep': 'smplxflame_30', 'fid_weight': 0.0, 'finger_net': 'original', 'freeze_wordembed': True, 'fsmr': 0.0, 'ftmr': 0.0, 'fusion_mode': 'sum', 'g_name': 'MAGE_Transformer', 'gap_weight': 0.0, 'gpus': [0], 'grad_norm': 0.99, 'hidden_size': 768, 'id_rep': 'onehot', 'input_context': 'both', 'is_train': True, 'ita_weight': 0.0, 'iwa_weight': 0.0, 'kld_aud_weight': 0.0, 'kld_fac_weight': 0.0, 'kld_weight': 0.0, 'l': 4, 'lf': 3.0, 'lh': 3.0, 'll': 3.0, 'loader_workers': 0, 'log_period': 10, 'loss_contrastive_neg_weight': 0.005, 
'loss_contrastive_pos_weight': 0.2, 'loss_gan_weight': 5.0, 'loss_kld_weight': 0.1, 'loss_physical_weight': 0.0, 'loss_reg_weight': 0.05, 'loss_regression_weight': 70.0, 'lr_base': 0.0005, 'lr_min': 1e-07, 'lr_policy': 'step', 'lu': 3.0, 'm_decoder': None, 'm_encoder': 'null', 'm_fix_pre': False, 'm_pre_encoder': 'null', 'mean_pose_path': '/datasets/trinity/train/', 'model': 'emage_audio', 'momentum': 0.8, 'motion_f': 256, 'msmr': 0.0, 'mtmr': 0.0, 'multi_length_training': [1.0], 'n_layer': 1, 'n_poses': 34, 'n_pre_poses': 4, 'name': '0409_112135_emage_test_hf', 'nesterov': True, 'new_cache': True, 'no_adv_epoch': 999, 'notes': '', 'opt': 'adam', 'opt_betas': [0.5, 0.999], 'ori_joints': 'beat_smplx_joints', 'out_path': './outputs/audio2pose/', 'pos_encoding_type': 'sin', 'pos_prob': 1.0, 'pose_dims': 330, 'pose_fps': 30, 'pose_length': 64, 'pose_norm': False, 'pose_rep': 'smplxflame_30', 'pre_frames': 4, 'pre_type': 'zero', 'pretrain': False, 'project': 's2g', 'queue_size': 1024, 'random_seed': 2021, 'rec_aud_weight': 0.0, 'rec_fac_weight': 0.0, 'rec_pos_weight': 0.0, 'rec_txt_weight': 0.0, 'rec_ver_weight': 0.0, 'rec_weight': 1.0, 'render_concurrent_num': 1, 'render_tmp_img_filetype': 'bmp', 'render_video_fps': 30, 'render_video_height': 720, 'render_video_width': 1920, 'root_path': './', 'rot6d': True, 'sem_rep': None, 'sparse': 1, 'speaker_dims': 4, 'speaker_f': 0, 'speaker_id': 'onehot', 'stat': 'ts', 'std_pose_path': '/datasets/trinity/train/', 'stride': 20, 't_encoder': None, 't_fix_pre': False, 't_pre_encoder': None, 'tar_joints': 'beat_smplx_full', 'test_ckpt': './EMAGE/emage_audio_175.bin', 'test_data_path': '/datasets/trinity/test/', 'test_length': 64, 'test_period': 20, 'train_data_path': '/datasets/trinity/train/', 'train_trans': True, 'trainer': 'emage', 'training_speakers': [2], 'tsmr': 0.0, 'ttmr': 0.0, 'txt_prob': 1.0, 'use_aug': False, 'vae_codebook_size': 256, 'vae_grow': [1, 1, 2, 1], 'vae_layer': 4, 'vae_length': 240, 'vae_quantizer_lambda': 
1.0, 'vae_test_dim': 330, 'vae_test_len': 32, 'vae_test_stride': 20, 'val_data_path': '/datasets/trinity/val/', 'variational': False, 'vel_weight': 0.0, 'warmup_epochs': 0, 'warmup_lr': 0.0005, 'wei_weight': 0.0, 'weight_decay': 0.0, 'word_cache': False, 'word_dims': 300, 'word_f': 0, 'word_index_num': 5793, 'word_rep': None, 'z_type': 'speaker'} 2024-04-09 11:21:35.381 | INFO | utils.other_tools_hf:print_exp_info:878 - # ------------ 0409_112135_emage_test_hf ----------- # 2024-04-09 11:21:35.382 | INFO | utils.other_tools_hf:print_exp_info:879 - PyTorch version: 2.2.0+cu121 2024-04-09 11:21:35.382 | INFO | utils.other_tools_hf:print_exp_info:880 - CUDA version: 12.1 2024-04-09 11:21:35.411 | INFO | utils.other_tools_hf:print_exp_info:881 - 1 GPUs 2024-04-09 11:21:35.411 | INFO | utils.other_tools_hf:print_exp_info:882 - Random Seed: 2021 /tmp/tmpxrjlchkhash.wav 2024-04-09 11:21:44.407 | INFO | dataloaders.beat_testonly_hf:build_cache:90 - Audio bit rate: 16000 2024-04-09 11:21:44.408 | INFO | dataloaders.beat_testonly_hf:build_cache:91 - Reading data './EMAGE/test_sequences/'... 2024-04-09 11:21:44.408 | INFO | dataloaders.beat_testonly_hf:build_cache:92 - Creating the dataset cache... 
2024-04-09 11:21:44.410 | INFO | dataloaders.beat_testonly_hf:cache_generation:140 - # ---- Building cache for Pose dummy 2nd ---- # 2024-04-09 11:21:48.702 | INFO | dataloaders.beat_testonly_hf:cache_generation:214 - # ---- Building cache for Facial dummy 2nd and Pose dummy 2nd ---- # 2024-04-09 11:21:48.705 | INFO | dataloaders.beat_testonly_hf:cache_generation:260 - # ---- Building cache for Audio dummy 2nd and Pose dummy 2nd ---- # 22050 (169201,) (122777,) 1906 2024-04-09 11:21:48.711 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:517 - audio: 7s, pose: 63s, facial: 63s 2024-04-09 11:21:48.712 | WARNING | dataloaders.beat_testonly_hf:_sample_from_clip:521 - reduce to 7s, ignore 56s 2024-04-09 11:21:48.712 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:544 - pose from frame 0 to 210, length 210 2024-04-09 11:21:48.712 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:545 - 1 clips is expected with stride 210 2024-04-09 11:21:48.712 | INFO | dataloaders.beat_testonly_hf:_sample_from_clip:549 - audio from frame 0 to 112000, length 112000 2024-04-09 11:21:49.020 | INFO | dataloaders.beat_testonly_hf:cache_generation:478 - no. of samples: 1 2024-04-09 11:21:49.020 | INFO | dataloaders.beat_testonly_hf:cache_generation:483 - no. 
of excluded samples: 0 (0.0%) 2024-04-09 11:21:49.021 | INFO | predict:__init__:610 - Init test dataloader success 2024-04-09 11:21:50.586 | INFO | predict:__init__:623 - DataParallel( (module): MAGE_Transformer( (audio_pre_encoder_face): WavEncoder( (feat_extractor): Sequential( (0): BasicBlock( (conv1): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,)) (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,)) (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): BasicBlock( (conv1): Conv1d(64, 64, kernel_size=(15,), stride=(6,)) (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(64, 64, kernel_size=(15,), stride=(6,)) (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (2): BasicBlock( (conv1): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) ) (3): BasicBlock( (conv1): Conv1d(64, 128, kernel_size=(15,), 
stride=(6,)) (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(64, 128, kernel_size=(15,), stride=(6,)) (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (4): BasicBlock( (conv1): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,)) (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) ) (5): BasicBlock( (conv1): Conv1d(128, 256, kernel_size=(15,), stride=(3,)) (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(256, 256, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(128, 256, kernel_size=(15,), stride=(3,)) (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (audio_pre_encoder_body): WavEncoder( (feat_extractor): Sequential( (0): BasicBlock( (conv1): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,)) (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, 
affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(1, 64, kernel_size=(15,), stride=(5,), padding=(1600,)) (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (1): BasicBlock( (conv1): Conv1d(64, 64, kernel_size=(15,), stride=(6,)) (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(64, 64, kernel_size=(15,), stride=(6,)) (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (2): BasicBlock( (conv1): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(64, 64, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) ) (3): BasicBlock( (conv1): Conv1d(64, 128, kernel_size=(15,), stride=(6,)) (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(64, 128, kernel_size=(15,), stride=(6,)) (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) (4): BasicBlock( (conv1): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,)) (bn1): 
BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) ) (5): BasicBlock( (conv1): Conv1d(128, 256, kernel_size=(15,), stride=(3,)) (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act1): LeakyReLU(negative_slope=0.01, inplace=True) (conv2): Conv1d(256, 256, kernel_size=(15,), stride=(1,), padding=(7,)) (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (act2): LeakyReLU(negative_slope=0.01, inplace=True) (downsample): Sequential( (0): Conv1d(128, 256, kernel_size=(15,), stride=(3,)) (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) ) ) ) ) (motion_encoder): VQEncoderV6( (main): Sequential( (0): Conv1d(337, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): ResBlock( (model): Sequential( (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) ) ) (3): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (4): LeakyReLU(negative_slope=0.2, inplace=True) (5): ResBlock( (model): Sequential( (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) ) ) (6): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (7): LeakyReLU(negative_slope=0.2, inplace=True) (8): ResBlock( (model): Sequential( (0): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,)) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Conv1d(256, 256, kernel_size=(3,), 
stride=(1,), padding=(1,)) ) ) ) ) (feature2face): Linear(in_features=512, out_features=768, bias=True) (face2latent): Linear(in_features=768, out_features=256, bias=True) (transformer_de_layer): TransformerDecoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (multihead_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) (dropout3): Dropout(p=0.1, inplace=False) ) (face_decoder): TransformerDecoder( (layers): ModuleList( (0-3): 4 x TransformerDecoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (multihead_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) (dropout3): Dropout(p=0.1, inplace=False) ) ) ) (position_embeddings): PeriodicPositionalEncoding( (dropout): Dropout(p=0.1, inplace=False) ) (transformer_en_layer): TransformerEncoderLayer( (self_attn): MultiheadAttention( (out_proj): 
NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) ) (motion_self_encoder): TransformerEncoder( (layers): ModuleList( (0): TransformerEncoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) ) ) ) (audio_feature2motion): Linear(in_features=256, out_features=768, bias=True) (feature2motion): Linear(in_features=256, out_features=768, bias=True) (bodyhints_face): MLP( (mlp): Sequential( (0): Linear(in_features=256, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=256, bias=True) ) ) (bodyhints_body): MLP( (mlp): Sequential( (0): Linear(in_features=256, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=256, bias=True) ) ) (motion2latent_upper): MLP( (mlp): Sequential( (0): Linear(in_features=768, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=768, bias=True) ) ) (motion2latent_hands): MLP( (mlp): Sequential( (0): Linear(in_features=768, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, 
inplace=True) (2): Linear(in_features=768, out_features=768, bias=True) ) ) (motion2latent_lower): MLP( (mlp): Sequential( (0): Linear(in_features=768, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=768, bias=True) ) ) (wordhints_decoder): TransformerDecoder( (layers): ModuleList( (0-7): 8 x TransformerDecoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (multihead_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) (dropout3): Dropout(p=0.1, inplace=False) ) ) ) (upper_decoder): TransformerDecoder( (layers): ModuleList( (0): TransformerDecoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (multihead_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) (dropout3): Dropout(p=0.1, inplace=False) ) ) ) (hands_decoder): 
TransformerDecoder( (layers): ModuleList( (0): TransformerDecoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (multihead_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) (dropout3): Dropout(p=0.1, inplace=False) ) ) ) (lower_decoder): TransformerDecoder( (layers): ModuleList( (0): TransformerDecoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (multihead_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) ) (linear1): Linear(in_features=768, out_features=1536, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1536, out_features=768, bias=True) (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) (dropout3): Dropout(p=0.1, inplace=False) ) ) ) (face_classifier): MLP( (mlp): Sequential( (0): Linear(in_features=256, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=256, bias=True) ) ) (upper_classifier): MLP( (mlp): Sequential( (0): Linear(in_features=256, out_features=768, bias=True) (1): 
LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=256, bias=True) ) ) (hands_classifier): MLP( (mlp): Sequential( (0): Linear(in_features=256, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=256, bias=True) ) ) (lower_classifier): MLP( (mlp): Sequential( (0): Linear(in_features=256, out_features=768, bias=True) (1): LeakyReLU(negative_slope=0.2, inplace=True) (2): Linear(in_features=768, out_features=256, bias=True) ) ) (motion_down_upper): Linear(in_features=768, out_features=256, bias=True) (motion_down_hands): Linear(in_features=768, out_features=256, bias=True) (motion_down_lower): Linear(in_features=768, out_features=256, bias=True) (spearker_encoder_body): Embedding(25, 768) (spearker_encoder_face): Embedding(25, 768) ) ) 2024-04-09 11:21:50.590 | INFO | predict:__init__:624 - init MAGE_Transformer success 2024-04-09 11:21:51.714 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv 2024-04-09 11:21:51.730 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv 2024-04-09 11:21:51.749 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv 2024-04-09 11:21:51.781 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv 2024-04-09 11:21:51.813 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for VAESKConv 2024-04-09 11:21:52.376 | INFO | utils.other_tools_hf:load_checkpoints:1042 - load self-pretrained checkpoints for MAGE_Transformer generate_silent_videos concurrentNum=1 time=1712661715.849715 subprocess_index=0 begin_ts=1712661717.4629269 processed 0 frames processed 100 frames subprocess_index=0 render=107.60 all=109.60 begin_ts=1712661717.46 render_end_ts=1712661825.06 write_end_ts=1712661827.06 ffmpeg version 5.1.4-0+deb12u1 
Copyright (c) 2000-2023 the FFmpeg developers built with gcc 12 (Debian 12.2.0-14) configuration: --prefix=/usr --extra-version=0+deb12u1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librist --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --disable-sndio --enable-libjxl --enable-pocketsphinx --enable-librsvg --enable-libmfx --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-libplacebo --enable-librav1e --enable-shared libavutil 57. 28.100 / 57. 28.100 libavcodec 59. 37.100 / 59. 37.100 libavformat 59. 27.100 / 59. 27.100 libavdevice 59. 7.100 / 59. 7.100 libavfilter 8. 44.100 / 8. 44.100 libswscale 6. 7.100 / 6. 7.100 libswresample 4. 7.100 / 4. 7.100 libpostproc 56. 6.100 / 56. 
6.100 Input #0, image2, from './outputs/audio2pose/custom/hf//999/frame_%d.bmp': Duration: 00:00:06.00, start: 0.000000, bitrate: N/A Stream #0:0: Video: bmp, bgr24, 1920x720, 30 fps, 30 tbr, 30 tbn Stream mapping: Stream #0:0 -> #0:0 (bmp (native) -> h264 (libx264)) Press [q] to stop, [?] for help [libx264 @ 0x5d27f62a4100] using cpu capabilities: MMX2 SSE2Fast SSSE3 SSE4.2 AVX FMA3 BMI2 AVX2 [libx264 @ 0x5d27f62a4100] profile High, level 4.0, 4:2:0, 8-bit [libx264 @ 0x5d27f62a4100] 264 - core 164 r3095 baee400 - H.264/MPEG-4 AVC codec - Copyleft 2003-2022 - http://www.videolan.org/x264.html - options: cabac=1 ref=3 deblock=1:0:0 analyse=0x3:0x113 me=hex subme=7 psy=1 psy_rd=1.00:0.00 mixed_ref=1 me_range=16 chroma_me=1 trellis=1 8x8dct=1 cqm=0 deadzone=21,11 fast_pskip=1 chroma_qp_offset=-2 threads=22 lookahead_threads=3 sliced_threads=0 nr=0 decimate=1 interlaced=0 bluray_compat=0 constrained_intra=0 bframes=3 b_pyramid=2 b_adapt=1 b_bias=0 direct=1 weightb=1 open_gop=0 weightp=2 keyint=250 keyint_min=25 scenecut=40 intra_refresh=0 rc_lookahead=40 rc=crf mbtree=1 crf=23.0 qcomp=0.60 qpmin=0 qpmax=69 qpstep=4 ip_ratio=1.40 aq=1:1.00 Output #0, mp4, to './outputs/audio2pose/custom/hf//999/silence_video.mp4': Metadata: encoder : Lavf59.27.100 Stream #0:0: Video: h264 (avc1 / 0x31637661), yuv420p(tv, progressive), 1920x720, q=2-31, 30 fps, 15360 tbn Metadata: encoder : Lavc59.37.100 libx264 Side data: cpb: bitrate max/min/avg: 0/0/0 buffer size: 0 vbv_delay: N/A frame= 1 fps=0.0 q=0.0 size= 0kB time=00:00:00.00 bitrate=N/A speed= 0x frame= 37 fps=0.0 q=0.0 size= 0kB time=00:00:00.00 bitrate=N/A speed= 0x frame= 64 fps= 53 q=0.0 size= 0kB time=00:00:00.00 bitrate=N/A speed= 0x frame= 96 fps= 54 q=29.0 size= 0kB time=00:00:00.90 bitrate= 0.4kbits/s speed=0.511x frame= 128 fps= 56 q=29.0 size= 0kB time=00:00:01.96 bitrate= 0.2kbits/s speed=0.867x frame= 160 fps= 58 q=29.0 size= 256kB time=00:00:03.03 bitrate= 691.5kbits/s speed=1.09x frame= 170 fps= 49 q=29.0 size= 
256kB time=00:00:03.36 bitrate= 623.0kbits/s speed=0.965x frame= 180 fps= 44 q=-1.0 Lsize= 544kB time=00:00:05.90 bitrate= 754.7kbits/s speed=1.45x video:541kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.544116% [libx264 @ 0x5d27f62a4100] frame I:1 Avg QP:22.59 size: 19472 [libx264 @ 0x5d27f62a4100] frame P:45 Avg QP:24.95 size: 6556 [libx264 @ 0x5d27f62a4100] frame B:134 Avg QP:29.78 size: 1779 [libx264 @ 0x5d27f62a4100] consecutive B-frames: 0.6% 0.0% 1.7% 97.8% [libx264 @ 0x5d27f62a4100] mb I I16..4: 12.2% 79.2% 8.6% [libx264 @ 0x5d27f62a4100] mb P I16..4: 0.4% 1.3% 0.5% P16..4: 11.0% 3.8% 1.7% 0.0% 0.0% skip:81.3% [libx264 @ 0x5d27f62a4100] mb B I16..4: 0.0% 0.1% 0.0% B16..8: 7.3% 1.6% 0.3% direct: 0.2% skip:90.4% L0:45.0% L1:45.5% BI: 9.5% [libx264 @ 0x5d27f62a4100] 8x8 transform intra:68.2% inter:45.3% [libx264 @ 0x5d27f62a4100] coded y,uvDC,uvAC intra: 26.1% 0.0% 0.0% inter: 1.6% 0.0% 0.0% [libx264 @ 0x5d27f62a4100] i16 v,h,dc,p: 30% 20% 11% 40% [libx264 @ 0x5d27f62a4100] i8 v,h,dc,ddl,ddr,vr,hd,vl,hu: 46% 9% 32% 2% 2% 3% 3% 3% 1% [libx264 @ 0x5d27f62a4100] i4 v,h,dc,ddl,ddr,vr,hd,vl,hu: 35% 15% 17% 6% 5% 6% 5% 8% 4% [libx264 @ 0x5d27f62a4100] i8c dc,h,v,p: 100% 0% 0% 0% [libx264 @ 0x5d27f62a4100] Weighted P-Frames: Y:0.0% UV:0.0% [libx264 @ 0x5d27f62a4100] ref P L0: 57.1% 8.6% 21.5% 12.8% [libx264 @ 0x5d27f62a4100] ref B L0: 82.7% 13.0% 4.2% [libx264 @ 0x5d27f62a4100] ref B L1: 95.2% 4.8% [libx264 @ 0x5d27f62a4100] kb/s:737.16 Video conversion successful. 
Output file: ./outputs/audio2pose/custom/hf//999/silence_video.mp4 ffmpeg version 5.1.4-0+deb12u1 Copyright (c) 2000-2023 the FFmpeg developers built with gcc 12 (Debian 12.2.0-14) configuration: --prefix=/usr --extra-version=0+deb12u1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librist --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzimg --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --disable-sndio --enable-libjxl --enable-pocketsphinx --enable-librsvg --enable-libmfx --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libx264 --enable-libplacebo --enable-librav1e --enable-shared libavutil 57. 28.100 / 57. 28.100 libavcodec 59. 37.100 / 59. 37.100 libavformat 59. 27.100 / 59. 27.100 libavdevice 59. 7.100 / 59. 7.100 libavfilter 8. 44.100 / 8. 44.100 libswscale 6. 7.100 / 6. 7.100 libswresample 4. 7.100 / 4. 7.100 libpostproc 56. 6.100 / 56. 
6.100 Input #0, mov,mp4,m4a,3gp,3g2,mj2, from './outputs/audio2pose/custom/hf//999/silence_video.mp4': Metadata: major_brand : isom minor_version : 512 compatible_brands: isomiso2avc1mp41 encoder : Lavf59.27.100 Duration: 00:00:06.00, start: 0.000000, bitrate: 742 kb/s Stream #0:0[0x1](und): Video: h264 (High) (avc1 / 0x31637661), yuv420p(progressive), 1920x720, 738 kb/s, 30 fps, 30 tbr, 15360 tbn (default) Metadata: handler_name : VideoHandler vendor_id : [0][0][0][0] encoder : Lavc59.37.100 libx264 Guessed Channel Layout for Input Stream #1.0 : stereo Input #1, wav, from '/tmp/tmpxrjlchkhash.wav': Metadata: encoder : Lavf58.76.100 Duration: 00:00:07.67, bitrate: 1536 kb/s Stream #1:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 48000 Hz, stereo, s16, 1536 kb/s Stream mapping: Stream #0:0 -> #0:0 (copy) Stream #1:0 -> #0:1 (pcm_s16le (native) -> aac (native)) Press [q] to stop, [?] for help Output #0, mp4, to './outputs/audio2pose/custom/hf//999/res_2_scott_0_3_3.mp4': Metadata: major_brand : isom minor_version : 512 compatible_brands: isomiso2avc1mp41 encoder : Lavf59.27.100 Stream #0:0(und): Video: h264 (High) (avc1 / 0x31637661), yuv420p(progressive), 1920x720, q=2-31, 738 kb/s, 30 fps, 30 tbr, 15360 tbn (default) Metadata: handler_name : VideoHandler vendor_id : [0][0][0][0] encoder : Lavc59.37.100 libx264 Stream #0:1: Audio: aac (LC) (mp4a / 0x6134706D), 48000 Hz, stereo, fltp, 128 kb/s Metadata: encoder : Lavc59.37.100 aac frame= 0 fps=0.0 q=-1.0 size= 0kB time=00:00:00.00 bitrate=N/A speed= 0x frame= 171 fps=0.0 q=-1.0 size= 512kB time=00:00:05.63 bitrate= 744.8kbits/s speed=10.8x frame= 180 fps=0.0 q=-1.0 Lsize= 643kB time=00:00:06.01 bitrate= 875.0kbits/s speed= 11x video:541kB audio:94kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 1.232579% [aac @ 0x5bacce29f280] Qavg: 1877.850 Video with audio generated successfully: ./outputs/audio2pose/custom/hf//999/res_2_scott_0_3_3.mp4
Want to make some of these yourself?
Run this model