Spaces:
Running
Running
# This file is largely borrowed from OpenCLIP (open_clip).
| import hashlib | |
| import json | |
| import logging | |
| import os | |
| import re | |
| import urllib | |
| import warnings | |
| from copy import deepcopy | |
| from dataclasses import dataclass, asdict | |
| from functools import partial | |
| from pathlib import Path | |
| from typing import Any, Optional, Tuple | |
| from typing import Dict, Union | |
| from typing import List | |
| import torch | |
| import torch.nn as nn | |
| import torchvision.transforms.functional as F | |
| from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, \ | |
| CenterCrop | |
| from tqdm import tqdm | |
| from .clip_model import CLIP, convert_to_custom_text_state_dict, \ | |
| resize_pos_embed | |
| from .clip_model import build_model_from_openai_state_dict, convert_weights_to_lp, get_cast_dtype | |
| from .tokenizer import HFTokenizer, tokenize | |
# Version string forwarded to hf_hub_download as ``library_version``.
__version__ = '2.16.0'

# huggingface_hub is optional: when it cannot be imported the download helper
# is set to None and the availability flag records that fact.
try:
    from huggingface_hub import hf_hub_download
except ImportError:
    hf_hub_download = None
    _has_hf_hub = False
else:
    # Pre-bind the library identification so every call carries it.
    hf_hub_download = partial(hf_hub_download, library_name="open_clip", library_version=__version__)
    _has_hf_hub = True
def _pcfg(url='', hf_hub='', mean=None, std=None):
    """Build one pretrained-weights config entry.

    Returns a dict with the keys ``url``, ``hf_hub``, ``mean`` and ``std``
    holding the given values unchanged.
    """
    return {
        'url': url,
        'hf_hub': hf_hub,
        'mean': mean,
        'std': std,
    }
# Per-architecture tables of pretrained weights: tag -> config entry built by _pcfg.
# An entry carries either a direct download URL or a Hugging Face Hub reference;
# a hub reference ending in '/' names only the repo (see download_pretrained).
_VITB32 = {
    'openai': _pcfg(
        url="https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt"),
    'laion400m_e31': _pcfg(
        url="https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e31-d867053b.pt"),
    'laion400m_e32': _pcfg(
        url="https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e32-46683a32.pt"),
    'laion2b_e16': _pcfg(
        url="https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-laion2b_e16-af8dbd0c.pth"),
    'laion2b_s34b_b79k': _pcfg(hf_hub='laion/CLIP-ViT-B-32-laion2B-s34B-b79K/'),
}

_VITB16 = {
    'openai': _pcfg(
        url="https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt"),
    'laion400m_e31': _pcfg(
        url="https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16-laion400m_e31-00efa78f.pt"),
    'laion400m_e32': _pcfg(
        url="https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16-laion400m_e32-55e67d44.pt"),
    'laion2b_s34b_b88k': _pcfg(hf_hub='laion/CLIP-ViT-B-16-laion2B-s34B-b88K/'),
}

_VITL14 = {
    'openai': _pcfg(
        url="https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt"),
    'laion400m_e31': _pcfg(
        url="https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_l_14-laion400m_e31-69988bb6.pt"),
    'laion400m_e32': _pcfg(
        url="https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_l_14-laion400m_e32-3d133497.pt"),
    'laion2b_s32b_b82k': _pcfg(
        hf_hub='laion/CLIP-ViT-L-14-laion2B-s32B-b82K/',
        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
}

_VITL14_336 = {
    'openai': _pcfg(
        url="https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt"),
}

_VITH14 = {
    'laion2b_s32b_b79k': _pcfg(hf_hub='laion/CLIP-ViT-H-14-laion2B-s32B-b79K/'),
}

_VITg14 = {
    'laion2b_s12b_b42k': _pcfg(hf_hub='laion/CLIP-ViT-g-14-laion2B-s12B-b42K/'),
    'laion2b_s34b_b88k': _pcfg(hf_hub='laion/CLIP-ViT-g-14-laion2B-s34B-b88K/'),
}

_VITbigG14 = {
    'laion2b_s39b_b160k': _pcfg(hf_hub='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k/'),
}

# Registry: architecture name -> its table of pretrained tags.
_PRETRAINED = {
    "ViT-B-32": _VITB32,
    "ViT-B-16": _VITB16,
    "ViT-L-14": _VITL14,
    "ViT-L-14-336": _VITL14_336,
    "ViT-H-14": _VITH14,
    "ViT-g-14": _VITg14,
    "ViT-bigG-14": _VITbigG14,
}
def _clean_tag(tag: str):
    """Normalize a pretrained tag: lower-case it and turn '-' into '_'."""
    lowered = tag.lower()
    return lowered.replace('-', '_')
def list_pretrained(as_str: bool = False):
    """List every (architecture, pretrained tag) pair in the registry.

    Each entry is a ``(model_name, pretrain_tag)`` tuple by default, or a
    ``'name:tag'`` string when ``as_str`` is true.
    """
    entries = []
    for arch, tags in _PRETRAINED.items():
        for tag_name in tags.keys():
            if as_str:
                entries.append(':'.join([arch, tag_name]))
            else:
                entries.append((arch, tag_name))
    return entries
def list_pretrained_models_by_tag(tag: str):
    """Return the architectures that offer the given pretrain tag.

    The tag is normalized with ``_clean_tag`` before the lookup.
    """
    wanted = _clean_tag(tag)
    return [arch for arch, tags in _PRETRAINED.items() if wanted in tags]
def list_pretrained_tags_by_model(model: str):
    """Return all pretrain tags registered for ``model`` (empty list if unknown)."""
    if model not in _PRETRAINED:
        return []
    return list(_PRETRAINED[model].keys())
def is_pretrained_cfg(model: str, tag: str):
    """Tell whether ``model`` is registered and has the (normalized) ``tag``."""
    return model in _PRETRAINED and _clean_tag(tag) in _PRETRAINED[model]
def get_pretrained_cfg(model: str, tag: str):
    """Return the pretrained config entry used for ``model``.

    NOTE: the ``tag`` argument is not honoured. The 'openai' entry is always
    chosen when the architecture has one; otherwise the first registered tag
    is used. The chosen tag is announced on stdout.

    Returns an empty dict when ``model`` is not in the registry.
    """
    available = _PRETRAINED.get(model)
    if available is None:
        return {}

    # Override whatever the caller asked for.
    tag = 'openai' if 'openai' in available.keys() else list(available.keys())[0]

    banner = '*' * 50
    print(banner)
    print(f'Use pretrained model from {tag}...')
    print(banner)

    return available.get(_clean_tag(tag), {})
def get_pretrained_url(model: str, tag: str):
    """Return the ``url`` field of the config chosen by ``get_pretrained_cfg`` ('' if absent)."""
    return get_pretrained_cfg(model, _clean_tag(tag)).get('url', '')
def _file_sha256(path: str, chunk_size: int = 1 << 20) -> str:
    """Return the hex SHA256 digest of the file at ``path``, read in chunks."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def download_pretrained_from_url(
        url: str,
        cache_dir: Union[str, None] = None,
):
    """Download a checkpoint from ``url`` into ``cache_dir`` and return its local path.

    Parameters
    ----------
    url : str
        Download location. For URLs containing 'openaipublic' the expected
        SHA256 is the second-to-last path component; for URLs containing
        'mlfoundations' it is the (possibly truncated) suffix after the last
        '-' in the file stem. Other URLs are not checksum-verified.
    cache_dir : Optional[str]
        Target directory; defaults to ``~/.cache/clip``. Created if missing.

    Returns
    -------
    str
        Path of the cached or freshly downloaded file.

    Raises
    ------
    RuntimeError
        If the target path exists but is not a regular file, or if the
        downloaded file fails the checksum comparison.
    """
    # `import urllib` alone does not guarantee the `request` submodule is loaded.
    import urllib.request

    if not cache_dir:
        cache_dir = os.path.expanduser("~/.cache/clip")
    os.makedirs(cache_dir, exist_ok=True)
    filename = os.path.basename(url)

    if 'openaipublic' in url:
        expected_sha256 = url.split("/")[-2]
    elif 'mlfoundations' in url:
        expected_sha256 = os.path.splitext(filename)[0].split("-")[-1]
    else:
        expected_sha256 = ''

    download_target = os.path.join(cache_dir, filename)

    if os.path.exists(download_target) and not os.path.isfile(download_target):
        raise RuntimeError(f"{download_target} exists and is not a regular file")

    if os.path.isfile(download_target):
        if not expected_sha256:
            # Nothing to verify against: trust the cached file.
            return download_target
        # startswith() because the mlfoundations file names carry a truncated hash.
        if _file_sha256(download_target).startswith(expected_sha256):
            return download_target
        warnings.warn(
            f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")

    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
        # Content-Length may be absent; tqdm accepts total=None in that case.
        content_length = source.headers.get("Content-Length")
        total = int(content_length) if content_length is not None else None
        with tqdm(total=total, ncols=80, unit='iB', unit_scale=True) as loop:
            while True:
                buffer = source.read(8192)
                if not buffer:
                    break
                output.write(buffer)
                loop.update(len(buffer))

    if expected_sha256 and not _file_sha256(download_target).startswith(expected_sha256):
        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not match")

    return download_target
def has_hf_hub(necessary=False):
    """Report whether huggingface_hub was importable.

    Raises RuntimeError instead of returning False when ``necessary`` is
    true and the package is missing.
    """
    if necessary and not _has_hf_hub:
        # The hub is required to continue but the package is not installed.
        raise RuntimeError(
            'Hugging Face hub model specified but package not installed. Run `pip install huggingface_hub`.')
    return _has_hf_hub
def download_pretrained_from_hf(
        model_id: str,
        filename: str = 'open_clip_pytorch_model.bin',
        revision=None,
        cache_dir: Union[str, None] = None,
):
    """Fetch ``filename`` from the Hub repo ``model_id`` and return the cached path.

    Raises RuntimeError (via ``has_hf_hub``) when huggingface_hub is not installed.
    """
    has_hf_hub(True)
    return hf_hub_download(model_id, filename, revision=revision, cache_dir=cache_dir)
def download_pretrained(
        cfg: Dict,
        force_hf_hub: bool = False,
        cache_dir: Union[str, None] = None,
):
    """Download the weights described by a pretrained config entry.

    A direct ``url`` is preferred unless ``force_hf_hub`` is set and an
    ``hf_hub`` reference exists. Returns the local file path, or '' when
    ``cfg`` is empty or names no source.
    """
    if not cfg:
        return ''

    url = cfg.get('url', '')
    hub_ref = cfg.get('hf_hub', '')
    if hub_ref and force_hf_hub:
        # Use the HF hub even though a url exists.
        url = ''

    if url:
        return download_pretrained_from_url(url, cache_dir=cache_dir)
    if not hub_ref:
        return ''

    has_hf_hub(True)
    # hf_hub entries combine model id and file name as 'org/model_name/filename.pt'.
    # A trailing slash ('org/model_name/') yields an empty file name, in which
    # case the default file name of download_pretrained_from_hf is used.
    model_id, filename = os.path.split(hub_ref)
    if filename:
        return download_pretrained_from_hf(model_id, filename=filename, cache_dir=cache_dir)
    return download_pretrained_from_hf(model_id, cache_dir=cache_dir)
# Default normalization statistics: image_transform passes these to Normalize
# when the caller supplies no mean / std, and create_model stores them on
# model.visual. Three values each — presumably one per RGB channel.
OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
@dataclass
class AugmentationCfg:
    """Training-time augmentation options consumed by ``image_transform``.

    This must be a dataclass: ``image_transform`` builds it from a dict with
    ``AugmentationCfg(**aug_cfg)`` and reads it back with ``dataclasses.asdict``.
    Fields left as ``None`` are dropped before being forwarded.
    """
    # Passed as `scale` to RandomResizedCrop on the non-timm path.
    scale: Tuple[float, float] = (0.9, 1.0)
    # The remaining options are only forwarded to timm's create_transform;
    # on the non-timm path they trigger an "unused" warning when set.
    ratio: Optional[Tuple[float, float]] = None
    color_jitter: Optional[Union[float, Tuple[float, float, float]]] = None
    interpolation: Optional[str] = None
    re_prob: Optional[float] = None
    re_count: Optional[int] = None
    # Selects the timm-based training pipeline in image_transform.
    use_timm: bool = False
class ResizeMaxSize(nn.Module):
    """Resize so the longest edge equals ``max_size``, then pad to a square.

    Parameters
    ----------
    max_size : int
        Side length of the square output.
    interpolation : InterpolationMode
        Interpolation passed to ``F.resize``.
    fn : str
        'min' or 'max'; stored as the matching builtin on ``self.fn``.
        ``forward`` does not consult it: the padding arithmetic assumes the
        image was scaled by its longest edge.
    fill : int
        Fill value passed to ``F.pad``.

    Raises
    ------
    TypeError
        If ``max_size`` is not an int.
    """

    def __init__(self, max_size, interpolation=InterpolationMode.BICUBIC, fn='max', fill=0):
        super().__init__()
        if not isinstance(max_size, int):
            raise TypeError(f"Size should be int. Got {type(max_size)}")
        self.max_size = max_size
        self.interpolation = interpolation
        # Previously both branches yielded `min`, so fn='max' was silently ignored.
        self.fn = min if fn == 'min' else max
        self.fill = fill

    def forward(self, img):
        """Return ``img`` resized and padded to ``max_size`` x ``max_size``."""
        if isinstance(img, torch.Tensor):
            # torchvision treats tensors as [..., H, W]; take the trailing two dims.
            height, width = img.shape[-2:]
        else:
            # Non-tensor inputs are expected to expose (width, height) via `.size`.
            width, height = img.size
        scale = self.max_size / float(max(height, width))
        new_size = tuple(round(dim * scale) for dim in (height, width))
        if scale != 1.0:
            img = F.resize(img, new_size, self.interpolation)
        # Pad even when no resize was needed, so that a non-square image whose
        # longest edge already equals max_size still comes out square.
        pad_h = self.max_size - new_size[0]
        pad_w = self.max_size - new_size[1]
        if pad_h or pad_w:
            img = F.pad(img, padding=[pad_w // 2, pad_h // 2, pad_w - pad_w // 2, pad_h - pad_h // 2], fill=self.fill)
        return img
def _convert_to_rgb(image):
    """Return ``image.convert('RGB')``; used as a step inside the transform pipelines."""
    rgb_image = image.convert('RGB')
    return rgb_image
def image_transform(
        image_size: int,
        is_train: bool,
        mean: Optional[Tuple[float, ...]] = None,
        std: Optional[Tuple[float, ...]] = None,
        resize_longest_max: bool = False,
        fill_color: int = 0,
        aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None,
):
    """Build the preprocessing pipeline for training or evaluation.

    Parameters
    ----------
    image_size : int or (list/tuple of ints)
        Target size. A square list/tuple is collapsed to a single int.
    is_train : bool
        Select the training pipeline (random resized crop, or timm when
        ``aug_cfg.use_timm`` is set) instead of the evaluation pipeline.
    mean, std :
        Normalization statistics; fall back to the OPENAI_DATASET_* constants.
        A scalar is repeated three times.
    resize_longest_max : bool
        Evaluation only: use ResizeMaxSize instead of Resize + CenterCrop.
    fill_color : int
        Fill value handed to ResizeMaxSize.
    aug_cfg : dict or AugmentationCfg
        Augmentation options; a dict is expanded into AugmentationCfg.

    Returns
    -------
    The composed transform (a torchvision ``Compose`` or, on the timm path,
    whatever ``timm.data.create_transform`` returns).
    """
    mean = mean or OPENAI_DATASET_MEAN
    if not isinstance(mean, (list, tuple)):
        mean = (mean,) * 3

    std = std or OPENAI_DATASET_STD
    if not isinstance(std, (list, tuple)):
        std = (std,) * 3

    if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]:
        # for square size, pass size as int so that Resize() uses aspect preserving shortest edge
        image_size = image_size[0]

    if isinstance(aug_cfg, dict):
        aug_cfg = AugmentationCfg(**aug_cfg)
    else:
        aug_cfg = aug_cfg or AugmentationCfg()

    normalize = Normalize(mean=mean, std=std)

    if is_train:
        # Drop unset options so they do not override downstream defaults.
        aug_cfg_dict = {k: v for k, v in asdict(aug_cfg).items() if v is not None}
        use_timm = aug_cfg_dict.pop('use_timm', False)
        if use_timm:
            from timm.data import create_transform  # timm can still be optional
            if isinstance(image_size, (tuple, list)):
                assert len(image_size) >= 2
                # tuple(...) so a list-valued image_size can be concatenated too.
                input_size = (3,) + tuple(image_size[-2:])
            else:
                input_size = (3, image_size, image_size)
            # by default, timm aug randomly alternates bicubic & bilinear for better robustness at inference time
            aug_cfg_dict.setdefault('interpolation', 'random')
            aug_cfg_dict.setdefault('color_jitter', None)  # disable by default
            train_transform = create_transform(
                input_size=input_size,
                is_training=True,
                hflip=0.,
                mean=mean,
                std=std,
                re_mode='pixel',
                **aug_cfg_dict,
            )
        else:
            train_transform = Compose([
                RandomResizedCrop(
                    image_size,
                    scale=aug_cfg_dict.pop('scale'),
                    interpolation=InterpolationMode.BICUBIC,
                ),
                _convert_to_rgb,
                ToTensor(),
                normalize,
            ])
            # Anything still left was only meaningful for the timm pipeline.
            if aug_cfg_dict:
                warnings.warn(
                    f'Unused augmentation cfg items, specify `use_timm` to use ({list(aug_cfg_dict.keys())}).')
        return train_transform
    else:
        if resize_longest_max:
            transforms = [
                ResizeMaxSize(image_size, fill=fill_color)
            ]
        else:
            transforms = [
                Resize(image_size, interpolation=InterpolationMode.BICUBIC),
                CenterCrop(image_size),
            ]
        transforms.extend([
            _convert_to_rgb,
            ToTensor(),
            normalize,
        ])
        return Compose(transforms)
def list_openai_models() -> List[str]:
    """Return the architectures that have an 'openai' pretrained entry."""
    openai_models = list_pretrained_models_by_tag('openai')
    return openai_models
def load_openai_model(
        name: str,
        precision: Optional[str] = None,
        device: Optional[Union[str, torch.device]] = None,
        jit: bool = True,
        cache_dir: Optional[str] = None,
):
    """Load a CLIP model

    Parameters
    ----------
    name : str
        A registered model name, or the path to a model checkpoint containing the state_dict
    precision: str
        Model precision, if None defaults to 'fp32' if device is a CPU device else 'fp16'.
    device : Union[str, torch.device]
        The device to put the loaded model; defaults to "cuda" when available, else "cpu"
    jit : bool
        Whether to load the optimized JIT model (default) or more hackable non-JIT model.
    cache_dir : Optional[str]
        The directory to cache the downloaded model weights

    Returns
    -------
    model : torch.nn.Module
        The CLIP model. (Only the model is returned; no preprocessing transform.)

    Raises
    ------
    RuntimeError
        If ``name`` is neither a registered model nor an existing file.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    if precision is None:
        # Compare on the device type: a torch.device object never equals the
        # string 'cpu', which previously selected fp16 for torch.device('cpu').
        precision = 'fp32' if torch.device(device).type == 'cpu' else 'fp16'

    cfg = get_pretrained_cfg(name, 'openai')
    if cfg:
        model_path = download_pretrained(cfg, cache_dir=cache_dir)
    elif os.path.isfile(name):
        model_path = name
    else:
        raise RuntimeError(f"Model {name} not found; available models = {list_pretrained()}")

    try:
        # loading JIT archive
        model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval()
        state_dict = None
    except RuntimeError:
        # loading saved state dict
        if jit:
            warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
            jit = False
        # NOTE(security): torch.load unpickles the file; only load checkpoints from trusted sources.
        state_dict = torch.load(model_path, map_location="cpu")

    # JIT -> Just In Time
    if not jit:
        # Build a non-jit model from the OpenAI jitted model state dict
        cast_dtype = get_cast_dtype(precision)
        try:
            model = build_model_from_openai_state_dict(state_dict or model.state_dict(), cast_dtype=cast_dtype)
        except KeyError:
            # Fallback: weights nested under "state_dict" with a 7-character
            # key prefix (presumably "module.") that is stripped here.
            sd = {k[7:]: v for k, v in state_dict["state_dict"].items()}
            model = build_model_from_openai_state_dict(sd, cast_dtype=cast_dtype)

        # model from OpenAI state dict is in manually cast fp16 mode, must be converted for AMP/fp32/bf16 use
        model = model.to(device)
        if precision.startswith('amp') or precision == 'fp32':
            model.float()
        elif precision == 'bf16':
            convert_weights_to_lp(model, dtype=torch.bfloat16)

        return model

    # patch the device names
    # Trace a tiny function to obtain a constant node that carries the target device.
    device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
    device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]

    def patch_device(module):
        """Rewrite constants whose value starts with "cuda" to the requested device."""
        try:
            graphs = [module.graph] if hasattr(module, "graph") else []
        except RuntimeError:
            graphs = []

        if hasattr(module, "forward1"):
            graphs.append(module.forward1.graph)

        for graph in graphs:
            for node in graph.findAllNodes("prim::Constant"):
                if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):
                    node.copyAttributes(device_node)

    model.apply(patch_device)
    patch_device(model.encode_image)
    patch_device(model.encode_text)

    # patch dtype to float32 (typically for CPU)
    if precision == 'fp32':
        float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
        float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
        float_node = float_input.node()

        def patch_float(module):
            """Rewrite dtype constants equal to 5 on aten::to nodes to float32."""
            try:
                graphs = [module.graph] if hasattr(module, "graph") else []
            except RuntimeError:
                graphs = []

            if hasattr(module, "forward1"):
                graphs.append(module.forward1.graph)

            for graph in graphs:
                for node in graph.findAllNodes("aten::to"):
                    inputs = list(node.inputs())
                    for i in [1, 2]:  # dtype can be the second or third argument to aten::to()
                        # 5 is presumably the half-precision dtype code — TODO confirm
                        if inputs[i].node()["value"] == 5:
                            inputs[i].node().copyAttributes(float_node)

        model.apply(patch_float)
        patch_float(model.encode_image)
        patch_float(model.encode_text)
        model.float()

    # ensure image_size attr available at consistent location for both jit and non-jit
    model.visual.image_size = model.input_resolution.item()
    return model
# Model names starting with this prefix are handled by get_tokenizer, which
# strips it and passes the remainder to HFTokenizer.
HF_HUB_PREFIX = 'hf-hub:'
# Directories or single .json files scanned for architecture configs.
_MODEL_CONFIG_PATHS = [Path(__file__).parent.parent / "./model_configs/"]
# Dictionary (model_name: config) of model architecture configs.
_MODEL_CONFIGS = {}
def _natural_key(string_):
    """Sort key that orders embedded digit runs numerically.

    The lower-cased string is split on digit runs; digit pieces become ints
    and all other pieces (including empty ones) stay strings.
    """
    key = []
    for piece in re.split(r'(\d+)', string_.lower()):
        key.append(int(piece) if piece.isdigit() else piece)
    return key
def _rescan_model_configs():
    """Reload the architecture config registry from ``_MODEL_CONFIG_PATHS``.

    Every registered path is either a single .json file or a directory whose
    *.json files are collected. A file is registered under its stem only if
    it defines 'embed_dim', 'vision_cfg' and 'text_cfg'. The registry is then
    rebuilt in natural-sort order of the model names.
    """
    global _MODEL_CONFIGS

    config_ext = ('.json',)
    required_keys = ('embed_dim', 'vision_cfg', 'text_cfg')

    config_files = []
    for config_path in _MODEL_CONFIG_PATHS:
        if config_path.is_file() and config_path.suffix in config_ext:
            config_files.append(config_path)
        elif config_path.is_dir():
            for ext in config_ext:
                config_files.extend(config_path.glob(f'*{ext}'))

    for cf in config_files:
        with open(cf, 'r') as f:
            model_cfg = json.load(f)
        if all(key in model_cfg for key in required_keys):
            _MODEL_CONFIGS[cf.stem] = model_cfg

    _MODEL_CONFIGS = dict(sorted(_MODEL_CONFIGS.items(), key=lambda item: _natural_key(item[0])))


_rescan_model_configs()  # initial populate of model config registry
def list_models():
    """Enumerate the model architectures found in the config registry."""
    return list(_MODEL_CONFIGS.keys())
def add_model_config(path):
    """Register an extra config file or directory and rescan the registry."""
    config_path = path if isinstance(path, Path) else Path(path)
    _MODEL_CONFIG_PATHS.append(config_path)
    _rescan_model_configs()
def get_model_config(model_name):
    """Return a deep copy of the config for ``model_name``, or None if unknown.

    The copy lets callers modify the result without touching the registry.
    """
    if model_name not in _MODEL_CONFIGS:
        return None
    return deepcopy(_MODEL_CONFIGS[model_name])
def get_tokenizer(model_name):
    """Pick the tokenizer for ``model_name``.

    Names starting with HF_HUB_PREFIX get an HFTokenizer built from the rest
    of the name. Otherwise the model config decides: an HFTokenizer when its
    text_cfg has 'hf_tokenizer_name', else the default ``tokenize`` function.
    """
    if model_name.startswith(HF_HUB_PREFIX):
        return HFTokenizer(model_name[len(HF_HUB_PREFIX):])

    text_cfg = get_model_config(model_name)['text_cfg']
    if 'hf_tokenizer_name' in text_cfg:
        return HFTokenizer(text_cfg['hf_tokenizer_name'])
    return tokenize
def load_state_dict(checkpoint_path: str, map_location='cpu'):
    """Load a checkpoint file and return its state dict.

    If the loaded object is a dict with a 'state_dict' key, that entry is
    used; otherwise the loaded object itself is. When the first key starts
    with 'module', the first 7 characters are stripped from every key
    (presumably the "module." prefix of a wrapped model — confirm).

    An empty state dict is returned unchanged.
    """
    # NOTE(security): torch.load unpickles the file; only load checkpoints from trusted sources.
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
        state_dict = checkpoint['state_dict']
    else:
        state_dict = checkpoint

    # A default avoids StopIteration on an empty state dict.
    first_item = next(iter(state_dict.items()), None)
    if first_item is not None and first_item[0].startswith('module'):
        state_dict = {k[7:]: v for k, v in state_dict.items()}
    return state_dict
def load_checkpoint(model, checkpoint_path, strict=True):
    """Load weights from ``checkpoint_path`` into ``model``.

    Returns whatever ``model.load_state_dict`` returns (the incompatible keys).
    """
    state_dict = load_state_dict(checkpoint_path)

    # detect old format and make compatible with new format
    uses_old_format = 'positional_embedding' in state_dict and not hasattr(model, 'positional_embedding')
    if uses_old_format:
        state_dict = convert_to_custom_text_state_dict(state_dict)

    resize_pos_embed(state_dict, model)
    return model.load_state_dict(state_dict, strict=strict)
def create_model(
        model_name: str,
        img_size: int,
        pretrained: Optional[str] = None,
        precision: str = 'fp32',
        device: Union[str, torch.device] = 'cpu',
        jit: bool = False,
        cache_dir: Optional[str] = None,
        output_dict: Optional[bool] = None,
):
    """Create a CLIP model at ``img_size`` initialised from OpenAI weights.

    The OpenAI checkpoint is loaded with ``load_openai_model``; its state dict
    is then copied into a fresh ``CLIP`` built from the architecture config
    with ``vision_cfg['image_size']`` overridden, after resizing the position
    embeddings.

    Parameters
    ----------
    model_name : str
        ViT architecture name; '/' is accepted in place of '-'.
    img_size : int
        Image size written into the vision config.
    pretrained : str
        Must be 'openai' (case-insensitive).
    precision : str
        Passed to ``get_cast_dtype``; 'fp16' / 'bf16' also convert the weights.
    device : Union[str, torch.device]
        Device the model is moved to.
    jit : bool
        Script the final model with ``torch.jit.script``.
    cache_dir : Optional[str]
        Cache directory for downloaded weights.
    output_dict : Optional[bool]
        When true and the model has an ``output_dict`` attribute, enable it.

    Raises
    ------
    NotImplementedError
        If ``model_name`` does not contain 'ViT'.
    AssertionError
        If ``pretrained`` is not 'openai'.
    RuntimeError
        If no architecture config is registered for ``model_name``.
    """
    if model_name.count('ViT') < 1:
        raise NotImplementedError('only support ViT model..')

    # in which means, we can also use old naming rules.
    model_name = model_name.replace('/', '-')  # for callers using old naming with / in ViT names
    pretrained_cfg = {}

    if isinstance(device, str):
        device = torch.device(device)

    # our default version are borrowed from openai
    # Raised explicitly (same exception type as before) so the check survives `python -O`.
    if not (pretrained and pretrained.lower() == 'openai'):
        raise AssertionError('only support openai module.')

    logging.info(f'Loading pretrained {model_name} from OpenAI.')
    model_cfg = get_model_config(model_name)
    if model_cfg is None:
        # Fail with a clear message instead of a TypeError on None below.
        raise RuntimeError(f'Model config for {model_name} not found.')
    model_cfg['vision_cfg']['image_size'] = img_size
    cast_dtype = get_cast_dtype(precision)

    model_pre = load_openai_model(
        model_name,
        precision=precision,
        device=device,
        jit=jit,
        cache_dir=cache_dir,
    )
    state_dict = model_pre.state_dict()

    model = CLIP(**model_cfg, cast_dtype=cast_dtype)

    # mainly need to resize the position embeddings
    resize_pos_embed(state_dict, model)
    model.load_state_dict(state_dict, strict=True)
    model.to(device=device)

    if precision in ("fp16", "bf16"):
        convert_weights_to_lp(model, dtype=torch.bfloat16 if precision == 'bf16' else torch.float16)

    # set image / mean metadata from pretrained_cfg if available, or use default
    # (pretrained_cfg is always empty here, so the OpenAI defaults apply)
    model.visual.image_mean = pretrained_cfg.get('mean', None) or OPENAI_DATASET_MEAN
    model.visual.image_std = pretrained_cfg.get('std', None) or OPENAI_DATASET_STD

    # to always output dict even if it is clip
    if output_dict and hasattr(model, "output_dict"):
        model.output_dict = True

    if jit:
        model = torch.jit.script(model)

    return model
def create_model_and_transforms(
        model_name: str,
        img_size: int,
        pretrained: Optional[str] = None,
        precision: str = 'fp32',
        device: Union[str, torch.device] = 'cpu',
        jit: bool = False,
        image_mean: Optional[Tuple[float, ...]] = None,
        image_std: Optional[Tuple[float, ...]] = None,
        aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None,
        cache_dir: Optional[str] = None,
        output_dict: Optional[bool] = None,
):
    """Create the CLIP model together with its train and eval preprocessing.

    ``image_mean`` / ``image_std`` fall back to the values stored on
    ``model.visual`` when not supplied.

    Returns
    -------
    (model, preprocess_train, preprocess_val)
    """
    # Build the CLIP model first; the transforms depend on its metadata.
    model = create_model(
        model_name,
        img_size,
        pretrained,
        precision=precision,
        device=device,
        jit=jit,
        cache_dir=cache_dir,
        output_dict=output_dict,
    )

    image_mean = image_mean or getattr(model.visual, 'image_mean', None)
    image_std = image_std or getattr(model.visual, 'image_std', None)
    normalization = dict(mean=image_mean, std=image_std)

    preprocess_train = image_transform(
        model.visual.image_size,
        is_train=True,
        aug_cfg=aug_cfg,
        **normalization,
    )
    preprocess_val = image_transform(
        model.visual.image_size,
        is_train=False,
        **normalization,
    )

    return model, preprocess_train, preprocess_val