from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os
import os.path as osp
import argparse
import numpy as np
from tqdm import tqdm
import torch
import torch.backends.cudnn as cudnn
import cv2
from threading import Thread
from queue import Queue
import json
import torch.multiprocessing as mp
import signal


def signal_handler(sig, frame):
    print("\nInterrupted by user, shutting down...")
    if 'loader_thread' in globals() and loader_thread.is_alive():
        loader_thread.join(timeout=1.0)  # Give the thread 1 second to finish
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # Free GPU memory immediately
    os._exit(0)  # os.exit() does not exist; os._exit() terminates immediately


# Register the signal handler
signal.signal(signal.SIGINT, signal_handler)

import _init_paths
from _init_paths import get_path
from utils.utilitys import PreProcess, load_json, plot_keypoint, write
from config import cfg, update_config
from utils.transforms import *
from utils.inference import get_final_preds
import models

sys.path.pop(0)
pre_dir, cur_dir, chk_root, data_root, lib_root, output_root = get_path(__file__)
cfg_dir = pre_dir + '/experiments/coco/hrnet/'
model_dir = chk_root + 'hrnet/pose_coco/'
sys.path.insert(0, lib_root)

from detector import load_model as yolo_model
from detector import yolo_human_det as yolo_det
from track.sort import Sort

sys.path.pop(0)

# Set multiprocessing start method
mp.set_start_method('spawn', force=True)


def parse_args():
    parser = argparse.ArgumentParser(description='Train keypoints network')
    parser.add_argument('--cfg', type=str, default=cfg_dir + 'w48_384x288_adam_lr1e-3.yaml')
    parser.add_argument('opts', nargs=argparse.REMAINDER, default=None)
    parser.add_argument('--modelDir', type=str, default=model_dir + 'pose_hrnet_w48_384x288.pth')
    parser.add_argument('--det-dim', type=int, default=416)
    parser.add_argument('--thred-score', type=float, default=0.50)
    parser.add_argument('-a', '--animation', action='store_true', help='output animation')
    parser.add_argument('-np', '--num-person', type=int, default=1)
    parser.add_argument("-v", "--video", type=str, default='camera')
    parser.add_argument('--batch-size', type=int, default=8)  # Reduced batch size
    args = parser.parse_args()
    return args


def reset_config(args):
    update_config(cfg, args)
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED


def model_load(config, use_fp16=False):
    model = eval('models.' + config.MODEL.NAME + '.get_pose_net')(config, is_train=False)
    state_dict = torch.load(config.OUTPUT_DIR, map_location=torch.device('cpu'))
    from collections import OrderedDict
    # Copy the checkpoint verbatim; extend this loop if keys ever need
    # renaming (e.g. stripping a 'module.' prefix left by DataParallel)
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        new_state_dict[k] = v
    model.load_state_dict(new_state_dict)
    if torch.cuda.is_available() and use_fp16:
        model = model.half().cuda()  # Use FP16 if specified and CUDA available
    elif torch.cuda.is_available():
        model = model.cuda()
    model.eval()
    return model
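# Illustrative sketch (hypothetical helper, not called anywhere): the FP16
# convention used throughout this file. Weights are halved once at load time,
# every input tensor is halved to match, and outputs are cast back to FP32
# before NumPy post-processing. The 384x288 input size is an assumption that
# matches the default w48_384x288 config; adjust it for other cfgs.
def _demo_fp16_roundtrip(pose_model, use_fp16=True):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    dummy = torch.randn(1, 3, 384, 288, device=device)
    if use_fp16 and device.type == 'cuda':
        dummy = dummy.half()  # match the model's precision
    with torch.no_grad():
        heatmaps = pose_model(dummy)
    return heatmaps.cpu().float()  # hand FP32 back to NumPy-based code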
def load_default_model():
    args = parse_args()
    reset_config(args)

    model = eval('models.' + cfg.MODEL.NAME + '.get_pose_net')(cfg, is_train=False)
    if torch.cuda.is_available():
        model = model.cuda()

    state_dict = torch.load(cfg.OUTPUT_DIR)
    from collections import OrderedDict
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        new_state_dict[k] = v
    model.load_state_dict(new_state_dict)
    model.eval()
    return model


def frame_loader(video, queue, video_length):
    """Producer: push up to video_length frames onto the queue, then a None
    sentinel so the consumer knows the stream has ended."""
    cap = cv2.VideoCapture(video)
    for _ in range(video_length):
        ret, frame = cap.read()
        if not ret:
            break
        queue.put(frame)
    queue.put(None)
    cap.release()


def process_batch(frames, human_model, pose_model, det_dim, num_person,
                  thred_score, use_fp16, device, people_sort):
    if not frames:
        return [], []

    batch_bboxs = []
    batch_centers = []
    batch_scales = []
    batch_inputs = []

    for frame in frames:
        bboxs, _ = yolo_det(frame, human_model, reso=det_dim, confidence=thred_score)
        if bboxs is None or not bboxs.any():
            continue

        people_track = people_sort.update(bboxs)
        if people_track.shape[0] == 0:
            continue

        num_to_track = min(num_person, people_track.shape[0])
        people_track_ = people_track[-num_to_track:, :-1]
        track_bboxs = np.round(people_track_, 2).tolist()

        inputs, _, center, scale = PreProcess(frame, track_bboxs, cfg, num_to_track)
        inputs = inputs[:, [2, 1, 0]]  # BGR to RGB

        batch_bboxs.append(track_bboxs)
        batch_centers.append(center)
        batch_scales.append(scale)
        batch_inputs.append(inputs)

    if not batch_inputs:
        return [], []

    inputs = torch.cat(batch_inputs, dim=0).to(device)
    if use_fp16 and device.type == 'cuda':
        inputs = inputs.half()  # Convert to FP16 to match model precision

    with torch.no_grad():
        outputs = pose_model(inputs)
    outputs = outputs.cpu().float()  # Ensure output is FP32 for post-processing

    kpts_result = []
    scores_result = []
    offset = 0
    for i, (center, scale) in enumerate(zip(batch_centers, batch_scales)):
        batch_size = len(batch_bboxs[i])
        # get_final_preds expects per-person centers/scales of shape (N, 2);
        # flattening them would break frames with more than one person
        preds, maxvals = get_final_preds(cfg,
                                         outputs[offset:offset + batch_size].numpy(),
                                         np.asarray(center),
                                         np.asarray(scale))
        offset += batch_size

        kpts = np.zeros((batch_size, 17, 2), dtype=np.float32)
        scores = np.zeros((batch_size, 17), dtype=np.float32)
        for j in range(batch_size):
            kpts[j] = preds[j]
            scores[j] = maxvals[j].squeeze()

        kpts_result.append(kpts)
        scores_result.append(scores)

    return kpts_result, scores_result
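# Minimal sketch of the producer/consumer pattern gen_video_kpts builds below
# (hypothetical helper; 'sample.mp4' is a placeholder path). frame_loader
# pushes a None sentinel after the last frame, so the consumer can drain the
# queue without tracking the exact frame count.
def _demo_frame_loader(video='sample.mp4', n_frames=16):
    q = Queue(maxsize=4)
    t = Thread(target=frame_loader, args=(video, q, n_frames))
    t.start()
    frames = []
    while True:
        frame = q.get()
        if frame is None:  # end-of-stream sentinel from frame_loader
            break
        frames.append(frame)
    t.join()
    return frames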
def gen_video_kpts(video, det_dim=416, num_person=1, gen_output=False,
                   batch_size=8, animation=False):
    def force_exit(sig, frame):
        print("\nForce terminating...")
        os._exit(1)

    signal.signal(signal.SIGINT, force_exit)

    args = parse_args()
    reset_config(args)

    cap = cv2.VideoCapture(video)
    assert cap.isOpened(), 'Cannot capture source'

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    torch.set_num_threads(max(1, mp.cpu_count() - 1))  # Leave one core for the loader thread
    torch.autograd.set_grad_enabled(False)  # Explicitly disable gradients

    # FP16 only pays off on GPUs with tensor cores (compute capability >= 7)
    use_fp16 = device.type == 'cuda' and torch.cuda.get_device_capability()[0] >= 7

    # Crude heuristic: cap the batch size at roughly one frame per GiB of GPU
    # memory, or at the CPU core count when running on CPU
    batch_size = max(1, min(batch_size,
                            torch.cuda.get_device_properties(0).total_memory // (1024 ** 3)
                            if device.type == 'cuda' else mp.cpu_count()))

    human_model = yolo_model(inp_dim=det_dim)
    pose_model = model_load(cfg, use_fp16=use_fp16).to(device)
    people_sort = Sort()

    video_length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    if animation:
        kpts_result = []
        scores_result = []
        for i in range(video_length):
            ret, frame = cap.read()
            if not ret:
                break

            bboxs, scores = yolo_det(frame, human_model, reso=det_dim,
                                     confidence=args.thred_score)
            if bboxs is None or not bboxs.any():
                continue

            people_track = people_sort.update(bboxs)
            if people_track.shape[0] == 1:
                people_track_ = people_track[-1, :-1].reshape(1, 4)
            elif people_track.shape[0] >= 2:
                people_track_ = people_track[-num_person:, :-1].reshape(num_person, 4)
                people_track_ = people_track_[::-1]
            else:
                continue

            track_bboxs = []
            for bbox in people_track_:
                bbox = [round(i, 2) for i in list(bbox)]
                track_bboxs.append(bbox)

            with torch.no_grad():
                inputs, origin_img, center, scale = PreProcess(frame, track_bboxs, cfg, num_person)
                inputs = inputs[:, [2, 1, 0]]  # BGR to RGB
                if device.type == 'cuda':
                    inputs = inputs.cuda()
                    if use_fp16:
                        inputs = inputs.half()  # Convert to FP16 if model is in FP16
                output = pose_model(inputs)

                preds, maxvals = get_final_preds(cfg, output.clone().cpu().numpy(),
                                                 np.asarray(center), np.asarray(scale))

            if gen_output:
                kpts = np.zeros((num_person, 17, 2), dtype=np.float32)
                scores = np.zeros((num_person, 17), dtype=np.float32)
                for j, kpt in enumerate(preds):
                    kpts[j] = kpt
                for j, score in enumerate(maxvals):
                    scores[j] = score.squeeze()
                kpts_result.append(kpts)
                scores_result.append(scores)
            else:
                index_bboxs = [bbox + [j] for j, bbox in enumerate(track_bboxs)]
                list(map(lambda x: write(x, frame), index_bboxs))
                plot_keypoint(frame, preds, maxvals, 0.3)
                cv2.imshow('frame', frame)
                key = cv2.waitKey(1)
                if key & 0xFF == ord('q'):
                    break
    else:
        frame_queue = Queue(maxsize=batch_size)  # Use regular Queue instead of mp.Queue
        loader_thread = Thread(target=frame_loader, args=(video, frame_queue, video_length))
        loader_thread.start()

        # Pre-allocate result arrays
        max_frames = video_length
        kpts_result = np.zeros((max_frames, num_person, 17, 2), dtype=np.float32)
        scores_result = np.zeros((max_frames, num_person, 17), dtype=np.float32)
        frame_idx = 0
        people_sort = Sort()  # Fresh tracker for the batched pass

        try:
            # The batched path is device-independent: process_batch handles
            # placement and precision internally, so one loop serves GPU and CPU
            batch_frames = []
            with torch.no_grad():
                for i in range(video_length):
                    frame = frame_queue.get(timeout=1.0)
                    if frame is None:
                        break
                    batch_frames.append(frame)

                    if len(batch_frames) >= batch_size:
                        kpts_batch, scores_batch = process_batch(
                            batch_frames, human_model, pose_model, det_dim,
                            num_person, args.thred_score, use_fp16, device, people_sort)
                        for kpts, scores in zip(kpts_batch, scores_batch):
                            kpts_result[frame_idx:frame_idx + 1] = kpts[None, :num_person]
                            scores_result[frame_idx:frame_idx + 1] = scores[None, :num_person]
                            frame_idx += 1
                        batch_frames = []

                    # Print progress roughly once per batch
                    if i % batch_size == 0:
                        progress = ((i + 1) / video_length) * 100
                        print(f"PROGRESS:{progress:.2f}%")

                # Process remaining frames
                if batch_frames:
                    kpts_batch, scores_batch = process_batch(
                        batch_frames, human_model, pose_model, det_dim,
                        num_person, args.thred_score, use_fp16, device, people_sort)
                    for kpts, scores in zip(kpts_batch, scores_batch):
                        kpts_result[frame_idx:frame_idx + 1] = kpts[None, :num_person]
                        scores_result[frame_idx:frame_idx + 1] = scores[None, :num_person]
                        frame_idx += 1
                    progress = (frame_idx / video_length) * 100
                    print(f"PROGRESS:{progress:.2f}%")
        finally:
            # Runs on success and on exceptions alike, so the loader thread is
            # always reaped and the capture always released
            loader_thread.join()
            cap.release()
            if device.type == 'cuda':
                torch.cuda.empty_cache()  # Free GPU memory

        if gen_output and kpts_result.any():
            keypoints = kpts_result[:frame_idx].transpose(1, 0, 2, 3)  # (P, F, 17, 2)
            scores = scores_result[:frame_idx].transpose(1, 0, 2)      # (P, F, 17)
            return keypoints, scores

    return None, None
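# Shape sketch for the arrays gen_video_kpts returns when gen_output=True:
# keypoints is (num_person, num_frames, 17, 2) and scores is
# (num_person, num_frames, 17) after the final transpose. This hypothetical
# helper shows one way to consume them; the 0.3 threshold is an assumption.
def _demo_filter_low_confidence(keypoints, scores, min_score=0.3):
    mask = scores < min_score  # (P, F, 17) boolean mask over joints
    filtered = keypoints.copy()
    filtered[mask] = 0.0  # zeroes both x and y for the masked joints
    return filtered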
def gen_img_kpts(image, human_model, pose_model, human_sort, det_dim=416, num_person=2):
    args = parse_args()
    reset_config(args)
    thred_score = args.thred_score

    bboxs, bbox_scores = yolo_det(image, human_model, reso=det_dim, confidence=thred_score)
    if bboxs is None or not bboxs.any():
        return None, None, None

    people_track = human_sort.update(bboxs)
    if people_track.shape[0] == 1:
        bboxs_track = people_track[-1].reshape(1, 5)
    else:
        people_track_ = people_track[-num_person:].reshape(num_person, 5)
        bboxs_track = people_track_[::-1]

    with torch.no_grad():
        inputs, origin_img, center, scale = PreProcess(image, bboxs_track, cfg, num_person)
        inputs = inputs[:, [2, 1, 0]]  # BGR to RGB

        # Match the device and precision the caller loaded the model with,
        # rather than guessing from GPU capability
        model_param = next(pose_model.parameters())
        if model_param.device.type == 'cuda':
            inputs = inputs.cuda()
        if model_param.dtype == torch.float16:
            inputs = inputs.half()
        output = pose_model(inputs)

        preds, maxvals = get_final_preds(cfg, output.clone().cpu().numpy(),
                                         np.asarray(center), np.asarray(scale))

        kpts = np.zeros((num_person, 17, 2), dtype=np.float32)
        scores = np.zeros((num_person, 17, 1), dtype=np.float32)
        for i, kpt in enumerate(preds):
            kpts[i] = kpt
        for i, score in enumerate(maxvals):
            scores[i] = score

        human_indexes = [bboxs_track[i, -1] for i in range(len(bboxs_track))]

    return kpts, scores, human_indexes


def generate_ntu_kpts_json(video_path, kpts_file):
    args = parse_args()
    reset_config(args)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    use_fp16 = device.type == 'cuda' and torch.cuda.get_device_capability()[0] >= 7

    human_model = yolo_model()
    # Load the pose model in the same precision the inputs will use below;
    # halving only the inputs against FP32 weights would raise a dtype error
    pose_model = model_load(cfg, use_fp16=use_fp16)
    people_sort = Sort()

    with torch.no_grad():
        cap = cv2.VideoCapture(video_path)
        video_length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        kpts_info = dict()
        data = []
        for i in tqdm(range(video_length), unit="frame", ncols=100):
            frame_info = {'frame_index': i + 1}

            ret, frame = cap.read()
            if not ret:
                continue

            try:
                bboxs, scores = yolo_det(frame, human_model, confidence=args.thred_score)
                if bboxs is None or not bboxs.any():
                    continue

                people_track = people_sort.update(bboxs)
                if people_track.shape[0] == 1:
                    people_track_ = people_track[-1, :-1].reshape(1, 4)
                elif people_track.shape[0] >= 2:
                    people_track_ = people_track[-2:, :-1].reshape(2, 4)
                    people_track_ = people_track_[::-1]
                else:
                    # No usable track: record an empty skeleton for this frame
                    skeleton = {'skeleton': [{'pose': [], 'score': [], 'bbox': []}]}
                    frame_info.update(skeleton)
                    data.append(frame_info)
                    continue

                track_bboxs = []
                for bbox in people_track_:
                    bbox = [round(i, 3) for i in list(bbox)]
                    track_bboxs.append(bbox)
            except Exception:
                continue

            inputs, origin_img, center, scale = PreProcess(frame, track_bboxs, cfg, args.num_person)
            inputs = inputs[:, [2, 1, 0]]  # BGR to RGB

            if device.type == 'cuda':
                inputs = inputs.cuda()
                if use_fp16:
                    inputs = inputs.half()  # Match model precision
            output = pose_model(inputs)

            preds, maxvals = get_final_preds(cfg, output.clone().cpu().numpy(),
                                             np.asarray(center), np.asarray(scale))

            skeleton = []
            for num, bbox in enumerate(track_bboxs):
                pose = preds[num].tolist()
                score = maxvals[num].tolist()
                pose = round_list(pose)
                score = round_list(score)
                one_skeleton = {'pose': pose, 'score': score, 'bbox': bbox}
                skeleton.append(one_skeleton)
            frame_info.update({'skeleton': skeleton})
            data.append(frame_info)

        kpts_info.update({'data': data})
        with open(kpts_file, 'w') as fw:
            json.dump(kpts_info, fw)

    cap.release()
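# Reader sketch for the JSON written by generate_ntu_kpts_json (hypothetical
# helper; 'kpts.json' is a placeholder file name). Each entry in data carries
# a 1-based frame_index and a skeleton list of per-person pose/score/bbox
# records; frames without a usable track store empty lists.
def _demo_read_ntu_json(kpts_file='kpts.json'):
    with open(kpts_file) as f:
        kpts_info = json.load(f)
    for frame_info in kpts_info['data']:
        for person in frame_info['skeleton']:
            if person['pose']:  # skip the empty-skeleton placeholder frames
                yield frame_info['frame_index'], person['pose'], person['score']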
def round_list(input_list, decimals=3):
    """Round every element of a 2-D nested list in place and return it."""
    for i in range(len(input_list)):
        for j in range(len(input_list[i])):
            input_list[i][j] = round(input_list[i][j], decimals)
    return input_list


if __name__ == "__main__":
    args = parse_args()
    video_path = args.video

    if args.animation:
        gen_video_kpts(video_path, det_dim=args.det_dim, num_person=args.num_person,
                       gen_output=False, animation=True)
    else:
        # Honor the CLI arguments instead of hard-coding their default values
        keypoints, scores = gen_video_kpts(video_path, det_dim=args.det_dim,
                                           num_person=args.num_person,
                                           gen_output=True, batch_size=args.batch_size)
        if keypoints is not None:
            output_file = "output.npz"
            np.savez(output_file, keypoints=keypoints, scores=scores)
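# Example invocations (script and video names are placeholders for however
# this module is saved and whatever footage is on hand):
#
#     python gen_kpts.py -v input.mp4                 # batched inference -> output.npz
#     python gen_kpts.py -v input.mp4 --animation     # live preview; press 'q' to quit
#
# output.npz holds keypoints of shape (num_person, num_frames, 17, 2) and
# scores of shape (num_person, num_frames, 17).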