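"""2D human pose extraction pipeline.

Detects people with a YOLO detector, associates them across frames with the SORT
tracker, and estimates COCO-style 17-keypoint poses with an HRNet model. Keypoints
can either be visualized live (--animation) or collected and saved for later use.
"""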
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os
import os.path as osp
import argparse
import numpy as np
from tqdm import tqdm
import torch
import torch.backends.cudnn as cudnn
import cv2
from threading import Thread
from queue import Queue
import json
import torch.multiprocessing as mp
from functools import partial
from io import StringIO
import signal


def signal_handler(sig, frame):
    print("\nInterrupted by user, shutting down...")
    if 'loader_thread' in globals() and loader_thread.is_alive():
        loader_thread.join(timeout=1.0)  # Give the thread 1 second to finish
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # Free GPU memory immediately
    sys.exit(0)  # os.exit() does not exist; use sys.exit() for a clean shutdown


# Register the signal handler
signal.signal(signal.SIGINT, signal_handler)
import _init_paths
from _init_paths import get_path
from utils.utilitys import PreProcess, load_json, plot_keypoint, write
from config import cfg, update_config
from utils.transforms import *
from utils.inference import get_final_preds
import models

sys.path.pop(0)

pre_dir, cur_dir, chk_root, data_root, lib_root, output_root = get_path(__file__)
cfg_dir = pre_dir + '/experiments/coco/hrnet/'
model_dir = chk_root + 'hrnet/pose_coco/'

sys.path.insert(0, lib_root)
from detector import load_model as yolo_model
from detector import yolo_human_det as yolo_det
from track.sort import Sort
sys.path.pop(0)

# Set multiprocessing start method
mp.set_start_method('spawn', force=True)
def parse_args():
    parser = argparse.ArgumentParser(description='Train keypoints network')
    parser.add_argument('--cfg', type=str, default=cfg_dir + 'w48_384x288_adam_lr1e-3.yaml')
    parser.add_argument('opts', nargs=argparse.REMAINDER, default=None)
    parser.add_argument('--modelDir', type=str, default=model_dir + 'pose_hrnet_w48_384x288.pth')
    parser.add_argument('--det-dim', type=int, default=416)
    parser.add_argument('--thred-score', type=float, default=0.50)
    parser.add_argument('-a', '--animation', action='store_true', help='output animation')
    parser.add_argument('-np', '--num-person', type=int, default=1)
    parser.add_argument("-v", "--video", type=str, default='camera')
    parser.add_argument('--batch-size', type=int, default=8)  # Reduced batch size
    args = parser.parse_args()
    return args
def reset_config(args):
    update_config(cfg, args)

    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED
def model_load(config, use_fp16=False):
    model = eval('models.' + config.MODEL.NAME + '.get_pose_net')(config, is_train=False)
    state_dict = torch.load(config.OUTPUT_DIR, map_location=torch.device('cpu'))

    from collections import OrderedDict
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        new_state_dict[k] = v
    model.load_state_dict(new_state_dict)

    if torch.cuda.is_available() and use_fp16:
        model = model.half().cuda()  # Use FP16 if specified and CUDA available
    elif torch.cuda.is_available():
        model = model.cuda()
    model.eval()
    return model
def load_default_model():
    args = parse_args()
    reset_config(args)

    model = eval('models.' + cfg.MODEL.NAME + '.get_pose_net')(cfg, is_train=False)
    if torch.cuda.is_available():
        model = model.cuda()
    state_dict = torch.load(cfg.OUTPUT_DIR)

    from collections import OrderedDict
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        new_state_dict[k] = v
    model.load_state_dict(new_state_dict)
    model.eval()
    return model
def frame_loader(video, queue, video_length):
    """Decode frames from `video` and push them into `queue`; a trailing None marks the end."""
    cap = cv2.VideoCapture(video)
    for _ in range(video_length):
        ret, frame = cap.read()
        if not ret:
            break
        queue.put(frame)
    queue.put(None)  # Sentinel so the consumer stops waiting for more frames
    cap.release()
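# Note: frame_loader is started as a background thread in gen_video_kpts and writes into a
# bounded Queue(maxsize=batch_size), so video decoding runs at most one batch ahead of
# inference instead of holding the whole video in memory.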
def process_batch(frames, human_model, pose_model, det_dim, num_person, thred_score, use_fp16, device, people_sort):
    """Run detection, tracking and pose estimation on a list of frames.

    Returns two lists (one entry per frame that produced detections): keypoint arrays of
    shape (n_tracked, 17, 2) and score arrays of shape (n_tracked, 17).
    """
    if not frames:
        return [], []

    batch_bboxs = []
    batch_centers = []
    batch_scales = []
    batch_inputs = []

    for frame in frames:
        bboxs, _ = yolo_det(frame, human_model, reso=det_dim, confidence=thred_score)
        if bboxs is None or not bboxs.any():
            continue

        people_track = people_sort.update(bboxs)
        if people_track.shape[0] == 0:
            continue

        num_to_track = min(num_person, people_track.shape[0])
        people_track_ = people_track[-num_to_track:, :-1]
        track_bboxs = np.round(people_track_, 2).tolist()

        inputs, _, center, scale = PreProcess(frame, track_bboxs, cfg, num_to_track)
        inputs = inputs[:, [2, 1, 0]]  # BGR to RGB

        batch_bboxs.append(track_bboxs)
        batch_centers.append(center)
        batch_scales.append(scale)
        batch_inputs.append(inputs)

    if not batch_inputs:
        return [], []

    inputs = torch.cat(batch_inputs, dim=0).to(device)
    if use_fp16 and device.type == 'cuda':
        inputs = inputs.half()  # Convert to FP16 to match model precision

    with torch.no_grad():
        outputs = pose_model(inputs)
    outputs = outputs.cpu().float()  # Ensure output is FP32 for post-processing

    kpts_result = []
    scores_result = []
    offset = 0
    for i, (center, scale) in enumerate(zip(batch_centers, batch_scales)):
        batch_size = len(batch_bboxs[i])
        # Keep center/scale as (n, 2) arrays, as the other call sites do;
        # flattening them breaks the per-person indexing inside get_final_preds.
        preds, maxvals = get_final_preds(cfg, outputs[offset:offset + batch_size].numpy(),
                                         np.asarray(center), np.asarray(scale))
        offset += batch_size

        kpts = np.zeros((batch_size, 17, 2), dtype=np.float32)
        scores = np.zeros((batch_size, 17), dtype=np.float32)
        for j in range(batch_size):
            kpts[j] = preds[j]
            scores[j] = maxvals[j].squeeze()

        kpts_result.append(kpts)
        scores_result.append(scores)

    return kpts_result, scores_result
def gen_video_kpts(video, det_dim=416, num_person=1, gen_output=False, batch_size=8, animation=False):
    """Estimate 2D keypoints for every frame of `video`.

    With animation=True frames are processed one by one and visualized; otherwise frames
    are read by a background thread and processed in batches of `batch_size`.
    """
    def force_exit(sig, frame):
        print("\nForce terminating...")
        os._exit(1)

    signal.signal(signal.SIGINT, force_exit)

    args = parse_args()
    reset_config(args)

    cap = cv2.VideoCapture(video)
    assert cap.isOpened(), 'Cannot capture source'

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    torch.set_num_threads(max(1, mp.cpu_count() - 1))  # Match thread count to processes
    torch.autograd.set_grad_enabled(False)  # Explicitly disable gradients

    # Determine FP16 usage based on device capability
    use_fp16 = device.type == 'cuda' and torch.cuda.get_device_capability()[0] >= 7
    # Cap the batch size at roughly one frame per GiB of GPU memory (or one per CPU core)
    batch_size = min(batch_size,
                     torch.cuda.get_device_properties(0).total_memory // (1024 ** 3)
                     if device.type == 'cuda' else mp.cpu_count())

    human_model = yolo_model(inp_dim=det_dim)
    pose_model = model_load(cfg, use_fp16=use_fp16).to(device)
    people_sort = Sort()

    video_length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    if animation:
        kpts_result = []
        scores_result = []
        for i in range(video_length):
            ret, frame = cap.read()
            if not ret:
                break

            bboxs, scores = yolo_det(frame, human_model, reso=det_dim, confidence=args.thred_score)
            if bboxs is None or not bboxs.any():
                continue

            people_track = people_sort.update(bboxs)
            if people_track.shape[0] == 1:
                people_track_ = people_track[-1, :-1].reshape(1, 4)
            elif people_track.shape[0] >= 2:
                people_track_ = people_track[-num_person:, :-1].reshape(num_person, 4)
                people_track_ = people_track_[::-1]
            else:
                continue

            track_bboxs = []
            for bbox in people_track_:
                bbox = [round(i, 2) for i in list(bbox)]
                track_bboxs.append(bbox)

            with torch.no_grad():
                inputs, origin_img, center, scale = PreProcess(frame, track_bboxs, cfg, num_person)
                inputs = inputs[:, [2, 1, 0]]  # BGR to RGB
                if device.type == 'cuda':
                    inputs = inputs.cuda()
                if use_fp16:
                    inputs = inputs.half()  # Convert to FP16 if model is in FP16
                output = pose_model(inputs)
                preds, maxvals = get_final_preds(cfg, output.clone().cpu().numpy(),
                                                 np.asarray(center), np.asarray(scale))

            if gen_output:
                kpts = np.zeros((num_person, 17, 2), dtype=np.float32)
                scores = np.zeros((num_person, 17), dtype=np.float32)
                for j, kpt in enumerate(preds):
                    kpts[j] = kpt
                for j, score in enumerate(maxvals):
                    scores[j] = score.squeeze()
                kpts_result.append(kpts)
                scores_result.append(scores)
            else:
                index_bboxs = [bbox + [j] for j, bbox in enumerate(track_bboxs)]
                list(map(lambda x: write(x, frame), index_bboxs))
                plot_keypoint(frame, preds, maxvals, 0.3)
                cv2.imshow('frame', frame)
                key = cv2.waitKey(1)
                if key & 0xFF == ord('q'):
                    break
    else:
        frame_queue = Queue(maxsize=batch_size)  # Use a regular Queue instead of mp.Queue
        loader_thread = Thread(target=frame_loader, args=(video, frame_queue, video_length))
        loader_thread.start()

        # Pre-allocate result arrays
        max_frames = video_length
        kpts_result = np.zeros((max_frames, num_person, 17, 2), dtype=np.float32)
        scores_result = np.zeros((max_frames, num_person, 17), dtype=np.float32)
        frame_idx = 0
        people_sort = Sort()

        try:
            # The same batched loop serves both GPU and CPU; process_batch handles device
            # placement, so the two previously duplicated branches are merged here.
            batch_frames = []
            with torch.no_grad():
                for i in range(video_length):
                    frame = frame_queue.get(timeout=1.0)
                    if frame is None:
                        break
                    batch_frames.append(frame)

                    if len(batch_frames) >= batch_size:
                        kpts_batch, scores_batch = process_batch(batch_frames, human_model, pose_model,
                                                                 det_dim, num_person, args.thred_score,
                                                                 use_fp16, device, people_sort)
                        for kpts, scores in zip(kpts_batch, scores_batch):
                            kpts_result[frame_idx:frame_idx + 1] = kpts[None, :num_person]
                            scores_result[frame_idx:frame_idx + 1] = scores[None, :num_person]
                            frame_idx += 1
                        batch_frames = []

                    # Print progress every batch
                    if i % batch_size == 0:
                        progress = ((i + 1) / video_length) * 100
                        print(f"PROGRESS:{progress:.2f}%")

                # Process remaining frames
                if batch_frames:
                    kpts_batch, scores_batch = process_batch(batch_frames, human_model, pose_model,
                                                             det_dim, num_person, args.thred_score,
                                                             use_fp16, device, people_sort)
                    for kpts, scores in zip(kpts_batch, scores_batch):
                        kpts_result[frame_idx:frame_idx + 1] = kpts[None, :num_person]
                        scores_result[frame_idx:frame_idx + 1] = scores[None, :num_person]
                        frame_idx += 1
                    progress = (frame_idx / video_length) * 100
                    print(f"PROGRESS:{progress:.2f}%")
        except Exception:
            loader_thread.join()
            raise
        finally:
            loader_thread.join()
            cap.release()
            if device.type == 'cuda':
                torch.cuda.empty_cache()  # Free GPU memory

    if gen_output and kpts_result.any():
        keypoints = kpts_result[:frame_idx].transpose(1, 0, 2, 3)
        scores = scores_result[:frame_idx].transpose(1, 0, 2)
        return keypoints, scores
    return None, None
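# Programmatic usage sketch (the video path below is a placeholder; the detector, config
# and checkpoint defaults come from parse_args()):
#
#     keypoints, scores = gen_video_kpts('input.mp4', det_dim=416, num_person=1,
#                                        gen_output=True, batch_size=8)
#     # keypoints: (num_person, n_frames, 17, 2); scores: (num_person, n_frames, 17)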
def gen_img_kpts(image, human_model, pose_model, human_sort, det_dim=416, num_person=2):
    """Estimate 2D keypoints for the tracked people in a single image."""
    args = parse_args()
    reset_config(args)
    thred_score = args.thred_score

    bboxs, bbox_scores = yolo_det(image, human_model, reso=det_dim, confidence=thred_score)
    if bboxs is None or not bboxs.any():
        return None, None, None

    people_track = human_sort.update(bboxs)
    if people_track.shape[0] == 1:
        bboxs_track = people_track[-1].reshape(1, 5)
    else:
        people_track_ = people_track[-num_person:].reshape(num_person, 5)
        bboxs_track = people_track_[::-1]

    with torch.no_grad():
        inputs, origin_img, center, scale = PreProcess(image, bboxs_track, cfg, num_person)
        inputs = inputs[:, [2, 1, 0]]  # BGR to RGB

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        use_fp16 = device.type == 'cuda' and torch.cuda.get_device_capability()[0] >= 7
        if device.type == 'cuda':
            inputs = inputs.cuda()
        if use_fp16:
            inputs = inputs.half()  # Match model precision
        output = pose_model(inputs)

        preds, maxvals = get_final_preds(cfg, output.clone().cpu().numpy(),
                                         np.asarray(center), np.asarray(scale))

        kpts = np.zeros((num_person, 17, 2), dtype=np.float32)
        scores = np.zeros((num_person, 17, 1), dtype=np.float32)
        for i, kpt in enumerate(preds):
            kpts[i] = kpt
        for i, score in enumerate(maxvals):
            scores[i] = score

    human_indexes = [bboxs_track[i, -1] for i in range(len(bboxs_track))]
    return kpts, scores, human_indexes
def generate_ntu_kpts_json(video_path, kpts_file):
    """Run the detection/tracking/pose pipeline over a video and dump the skeletons to a JSON file."""
    args = parse_args()
    reset_config(args)

    human_model = yolo_model()
    pose_model = model_load(cfg)
    people_sort = Sort()

    with torch.no_grad():
        cap = cv2.VideoCapture(video_path)
        video_length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        kpts_info = dict()
        data = []

        for i in tqdm(range(video_length), unit="%", ncols=100):
            frame_info = {'frame_index': i + 1}

            ret, frame = cap.read()
            if not ret:
                continue

            try:
                bboxs, scores = yolo_det(frame, human_model, confidence=args.thred_score)
                if bboxs is None or not bboxs.any():
                    continue

                people_track = people_sort.update(bboxs)
                if people_track.shape[0] == 1:
                    people_track_ = people_track[-1, :-1].reshape(1, 4)
                elif people_track.shape[0] >= 2:
                    people_track_ = people_track[-2:, :-1].reshape(2, 4)
                    people_track_ = people_track_[::-1]
                else:
                    skeleton = {'skeleton': [{'pose': [], 'score': [], 'bbox': []}]}
                    frame_info.update(skeleton)
                    data.append(frame_info)
                    continue

                track_bboxs = []
                for bbox in people_track_:
                    bbox = [round(i, 3) for i in list(bbox)]
                    track_bboxs.append(bbox)
            except Exception:
                continue

            inputs, origin_img, center, scale = PreProcess(frame, track_bboxs, cfg, args.num_person)
            inputs = inputs[:, [2, 1, 0]]  # BGR to RGB

            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            use_fp16 = device.type == 'cuda' and torch.cuda.get_device_capability()[0] >= 7
            if device.type == 'cuda':
                inputs = inputs.cuda()
            if use_fp16:
                inputs = inputs.half()  # Match model precision
            output = pose_model(inputs)

            preds, maxvals = get_final_preds(cfg, output.clone().cpu().numpy(),
                                             np.asarray(center), np.asarray(scale))

            skeleton = []
            for num, bbox in enumerate(track_bboxs):
                pose = preds[num].tolist()
                score = maxvals[num].tolist()
                pose = round_list(pose)
                score = round_list(score)

                one_skeleton = {'pose': pose, 'score': score, 'bbox': bbox}
                skeleton.append(one_skeleton)
            frame_info.update({'skeleton': skeleton})
            data.append(frame_info)

        kpts_info.update({'data': data})
        with open(kpts_file, 'w') as fw:
            json.dump(kpts_info, fw)

    cap.release()
def round_list(input_list, decimals=3):
    dim = len(input_list)
    for i in range(dim):
        for j in range(len(input_list[i])):
            input_list[i][j] = round(input_list[i][j], decimals)
    return input_list
if __name__ == "__main__":
    args = parse_args()
    video_path = args.video

    if args.animation:
        gen_video_kpts(video_path, det_dim=args.det_dim, num_person=args.num_person,
                       gen_output=False, animation=True)
    else:
        keypoints, scores = gen_video_kpts(video_path, det_dim=416, num_person=1,
                                           gen_output=True, batch_size=8)  # Increased batch_size to 8
        if keypoints is not None:
            output_file = "output.npz"
            np.savez(output_file, keypoints=keypoints, scores=scores)
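# Example invocations (the script filename is assumed; adjust to this repo's layout):
#   python gen_kpts.py --video input.mp4                 # save keypoints/scores to output.npz
#   python gen_kpts.py --video input.mp4 --animation     # visualize detections and poses live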