# Adapted from https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.1.0/tree/main (commit a5130bc)
import sys

import numpy as np
import torch
from tqdm import tqdm
from einops import rearrange

sys.path.append(".")
from opensora.eval.flolpips.pwcnet import Network as PWCNet
from opensora.eval.flolpips.flolpips import FloLPIPS

# Instantiate the metric networks once at import time; both are frozen
# (eval mode, gradients disabled) since they are only used for inference.
loss_fn = FloLPIPS(net='alex', version='0.1').eval().requires_grad_(False)
flownet = PWCNet().eval().requires_grad_(False)
def trans(x):
    # Identity hook: inputs are assumed to already be float tensors in the
    # layout and range expected by LPIPS/PWC-Net. Override for other formats.
    return x
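
# A hedged sketch of the kind of preprocessing `trans` is a placeholder for,
# assuming uint8 frames in [0, 255] laid out as (B, T, H, W, C). The name
# `trans_uint8_example` and the layout assumption are illustrative, not part
# of the upstream script.
def trans_uint8_example(x):
    # (B, T, H, W, C) uint8 -> (B, T, C, H, W) float in [0, 1]
    return rearrange(x, "b t h w c -> b t c h w").float() / 255.0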
def calculate_flolpips(videos1, videos2, device):
    # videos1/videos2: [batch_size, num_frames, channel, height, width]
    global loss_fn, flownet
    print("calculate_flolpips...")
    loss_fn = loss_fn.to(device)
    flownet = flownet.to(device)

    if videos1.shape != videos2.shape:
        print("Warning: the shapes of the two videos are not equal; "
              "truncating both to the shorter clip.")
        min_frames = min(videos1.shape[1], videos2.shape[1])
        videos1 = videos1[:, :min_frames]
        videos2 = videos2[:, :min_frames]

    videos1 = trans(videos1)
    videos2 = trans(videos2)
    flolpips_results = []
    for video_num in tqdm(range(videos1.shape[0])):
        video1 = videos1[video_num].to(device)  # reconstructed/distorted clip
        video2 = videos2[video_num].to(device)  # ground-truth clip

        # Build consecutive frame pairs (frame_i, frame_{i+1}) for optical flow.
        frames_rec = video1[:-1]
        frames_rec_next = video1[1:]
        frames_gt = video2[:-1]
        frames_gt_next = video2[1:]
        t, c, h, w = frames_gt.shape

        with torch.no_grad():
            # FloLPIPS weights the LPIPS map by the flow error between the
            # ground-truth and reconstructed sequences.
            flow_gt = flownet(frames_gt, frames_gt_next)
            flow_dis = flownet(frames_rec, frames_rec_next)
            flow_diff = flow_gt - flow_dis
            flolpips = loss_fn.forward(frames_gt, frames_rec, flow_diff, normalize=True)
        flolpips_results.append(flolpips.cpu().numpy().tolist())
    # One score per consecutive frame pair, so the second axis is num_frames - 1.
    flolpips_results = np.array(flolpips_results)  # [batch_size, num_frames - 1]

    flolpips = {}
    flolpips_std = {}
    # Aggregate across the batch for every frame-pair timestamp.
    for clip_timestamp in range(flolpips_results.shape[1]):
        flolpips[clip_timestamp] = np.mean(flolpips_results[:, clip_timestamp])
        flolpips_std[clip_timestamp] = np.std(flolpips_results[:, clip_timestamp])
    result = {
        "value": flolpips,
        "value_std": flolpips_std,
        "video_setting": list(video1.shape),
        "video_setting_name": "time, channel, height, width",
        # Keep only JSON-serializable types so the result can be dumped directly.
        "details": flolpips_results.tolist(),
    }

    return result
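
# A minimal convenience helper (not part of the upstream script): collapse the
# per-frame, per-video scores into one scalar, assuming a plain mean over all
# frame pairs and videos is the desired reduction.
def flolpips_overall(result):
    return float(np.mean(result["details"]))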
# test code / usage example
def main():
    NUMBER_OF_VIDEOS = 8
    VIDEO_LENGTH = 50
    CHANNEL = 3
    SIZE = 64
    # Dummy all-zero clips; real inputs should be video tensors shaped
    # [batch_size, num_frames, channel, height, width].
    videos1 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)
    videos2 = torch.zeros(NUMBER_OF_VIDEOS, VIDEO_LENGTH, CHANNEL, SIZE, SIZE, requires_grad=False)

    import json
    result = calculate_flolpips(videos1, videos2, "cuda:0")
    print(json.dumps(result, indent=4))

if __name__ == "__main__":
    main()