PUYONE committed on
Commit e69a9f5 (verified)
1 Parent(s): a708532

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +1 -0
  2. .github/workflows/typos.yaml +21 -0
  3. .gitignore +11 -0
  4. .gradio/certificate.pem +31 -0
  5. LICENSE.md +201 -0
  6. README.md +17 -8
  7. XTI_hijack.py +209 -0
  8. _typos.toml +15 -0
  9. cache/huggingface/gradio/frpc/frpc_linux_amd64_v0.3 +3 -0
  10. config_README-ja.md +279 -0
  11. config_files/accelerate/default_config.yaml +22 -0
  12. dreambooth_gui.py +944 -0
  13. fine_tune.py +430 -0
  14. fine_tune_README.md +465 -0
  15. fine_tune_README_ja.md +140 -0
  16. finetune/blip/blip.py +240 -0
  17. finetune/blip/med.py +955 -0
  18. finetune/blip/med_config.json +22 -0
  19. finetune/blip/vit.py +305 -0
  20. finetune/clean_captions_and_tags.py +190 -0
  21. finetune/hypernetwork_nai.py +96 -0
  22. finetune/make_captions.py +168 -0
  23. finetune/make_captions_by_git.py +151 -0
  24. finetune/merge_captions_to_metadata.py +76 -0
  25. finetune/merge_dd_tags_to_metadata.py +71 -0
  26. finetune/prepare_buckets_latents.py +267 -0
  27. finetune/tag_images_by_wd14_tagger.py +206 -0
  28. finetune_gui.py +888 -0
  29. gen_img_diffusers.py +0 -0
  30. gui.sh +9 -0
  31. kohya_gui.py +110 -0
  32. kohya_ss_colab.ipynb +448 -0
  33. library/__init__.py +0 -0
  34. library/basic_caption_gui.py +140 -0
  35. library/blip_caption_gui.py +149 -0
  36. library/common_gui.py +978 -0
  37. library/config_util.py +536 -0
  38. library/convert_model_gui.py +247 -0
  39. library/custom_train_functions.py +18 -0
  40. library/dataset_balancing_gui.py +146 -0
  41. library/dreambooth_folder_creation_gui.py +210 -0
  42. library/extract_lora_gui.py +178 -0
  43. library/extract_lycoris_locon_gui.py +309 -0
  44. library/git_caption_gui.py +136 -0
  45. library/lpw_stable_diffusion.py +1179 -0
  46. library/merge_lora_gui.py +156 -0
  47. library/model_util.py +1165 -0
  48. library/resize_lora_gui.py +173 -0
  49. library/sampler_gui.py +102 -0
  50. library/svd_merge_lora_gui.py +190 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+cache/huggingface/gradio/frpc/frpc_linux_amd64_v0.3 filter=lfs diff=lfs merge=lfs -text
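The added rule marks the bundled frpc binary as a Git LFS object. As a hedged sketch only (assuming `git-lfs` is installed), an equivalent `filter=lfs` line is what `git lfs track` appends to `.gitattributes`:

```bash
# Track the binary with Git LFS; this writes a filter=lfs line like the one added above.
git lfs track "cache/huggingface/gradio/frpc/frpc_linux_amd64_v0.3"
git add .gitattributes cache/huggingface/gradio/frpc/frpc_linux_amd64_v0.3
```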
.github/workflows/typos.yaml ADDED
@@ -0,0 +1,21 @@
+---
+# yamllint disable rule:line-length
+name: Typos
+
+on:  # yamllint disable-line rule:truthy
+  push:
+  pull_request:
+    types:
+      - opened
+      - synchronize
+      - reopened
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: typos-action
+        uses: crate-ci/typos@v1.13.10
.gitignore ADDED
@@ -0,0 +1,11 @@
+venv
+__pycache__
+cudnn_windows
+.vscode
+*.egg-info
+build
+wd14_tagger_model
+.DS_Store
+locon
+gui-user.bat
+gui-user.ps1
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
LICENSE.md ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [2022] [kohya-ss]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,12 +1,21 @@
 ---
-title: Kohya Ss Colab
-emoji: 📈
-colorFrom: indigo
-colorTo: gray
+title: kohya_ss_colab
+app_file: dreambooth_gui.py
 sdk: gradio
-sdk_version: 5.49.0
-app_file: app.py
-pinned: false
+sdk_version: 5.47.2
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/panguin6010/kohya_ss_google_colab/blob/master/kohya_ss_colab.ipynb)
+
+# Kohya SS WebUI Colab Setup
+
+This Colab notebook sets up a Kohya SS instance on Colab and provides a link to access the Kohya WebUI on Gradio Live. Kohya SS provides a Gradio-based WebUI and Python scripts for training Stable Diffusion models (DreamBooth, fine-tuning, and LoRA), and this notebook is a convenient way to run it without installing anything on your local machine.
+
+This notebook was inspired by [Spaceginner](https://github.com/Spaceginner)'s original Colab notebook and the [Kohya SS project](https://github.com/bmaltais/kohya_ss) by [bmaltais](https://github.com/bmaltais). The Colab notebook was coded by [panguin6010](https://github.com/panguin6010).
+
+
+## Tutorials
+
+Before running this code, make sure you are familiar with Colab notebooks and have a basic understanding of Kohya SS and its usage; tutorials for both are available online. If you encounter any issues or have suggestions for improvement, feel free to contribute to the project.
+
+## Link
+```https://colab.research.google.com/github/panguin6010/kohya_ss_google_colab/blob/master/kohya_ss_colab.ipynb```
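The README mentions reaching the WebUI "on Gradio Live". As an illustration only (not the notebook's actual launch code), a public `*.gradio.live` link is what Gradio produces when an app is launched with `share=True`:

```python
import gradio as gr

def echo(text: str) -> str:
    return text

# Minimal sketch: share=True asks Gradio to open a temporary public gradio.live URL,
# which is how a Colab-hosted GUI can be reached from a local browser.
demo = gr.Interface(fn=echo, inputs="text", outputs="text")
demo.launch(share=True)
```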
XTI_hijack.py ADDED
@@ -0,0 +1,209 @@
1
+ import torch
2
+ from typing import Union, List, Optional, Dict, Any, Tuple
3
+ from diffusers.models.unet_2d_condition import UNet2DConditionOutput
+ from diffusers.utils import logging
+
+ # `logger` is referenced in unet_forward_XTI below; use diffusers' logging helper
+ # so that call does not raise NameError.
+ logger = logging.get_logger(__name__)
4
+
5
+ def unet_forward_XTI(self,
6
+ sample: torch.FloatTensor,
7
+ timestep: Union[torch.Tensor, float, int],
8
+ encoder_hidden_states: torch.Tensor,
9
+ class_labels: Optional[torch.Tensor] = None,
10
+ return_dict: bool = True,
11
+ ) -> Union[UNet2DConditionOutput, Tuple]:
12
+ r"""
13
+ Args:
14
+ sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
15
+ timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
16
+ encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
17
+ return_dict (`bool`, *optional*, defaults to `True`):
18
+ Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
19
+
20
+ Returns:
21
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
22
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
23
+ returning a tuple, the first element is the sample tensor.
24
+ """
25
+ # By default samples have to be AT least a multiple of the overall upsampling factor.
26
+ # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
27
+ # However, the upsampling interpolation output size can be forced to fit any upsampling size
28
+ # on the fly if necessary.
29
+ default_overall_up_factor = 2**self.num_upsamplers
30
+
31
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
32
+ forward_upsample_size = False
33
+ upsample_size = None
34
+
35
+ if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
36
+ logger.info("Forward upsample size to force interpolation output size.")
37
+ forward_upsample_size = True
38
+
39
+ # 0. center input if necessary
40
+ if self.config.center_input_sample:
41
+ sample = 2 * sample - 1.0
42
+
43
+ # 1. time
44
+ timesteps = timestep
45
+ if not torch.is_tensor(timesteps):
46
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
47
+ # This would be a good case for the `match` statement (Python 3.10+)
48
+ is_mps = sample.device.type == "mps"
49
+ if isinstance(timestep, float):
50
+ dtype = torch.float32 if is_mps else torch.float64
51
+ else:
52
+ dtype = torch.int32 if is_mps else torch.int64
53
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
54
+ elif len(timesteps.shape) == 0:
55
+ timesteps = timesteps[None].to(sample.device)
56
+
57
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
58
+ timesteps = timesteps.expand(sample.shape[0])
59
+
60
+ t_emb = self.time_proj(timesteps)
61
+
62
+ # timesteps does not contain any weights and will always return f32 tensors
63
+ # but time_embedding might actually be running in fp16. so we need to cast here.
64
+ # there might be better ways to encapsulate this.
65
+ t_emb = t_emb.to(dtype=self.dtype)
66
+ emb = self.time_embedding(t_emb)
67
+
68
+ if self.config.num_class_embeds is not None:
69
+ if class_labels is None:
70
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
71
+ class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
72
+ emb = emb + class_emb
73
+
74
+ # 2. pre-process
75
+ sample = self.conv_in(sample)
76
+
77
+ # 3. down
78
+ down_block_res_samples = (sample,)
79
+ down_i = 0
80
+ for downsample_block in self.down_blocks:
81
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
82
+ sample, res_samples = downsample_block(
83
+ hidden_states=sample,
84
+ temb=emb,
85
+ encoder_hidden_states=encoder_hidden_states[down_i:down_i+2],
86
+ )
87
+ down_i += 2
88
+ else:
89
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
90
+
91
+ down_block_res_samples += res_samples
92
+
93
+ # 4. mid
94
+ sample = self.mid_block(sample, emb, encoder_hidden_states=encoder_hidden_states[6])
95
+
96
+ # 5. up
97
+ up_i = 7
98
+ for i, upsample_block in enumerate(self.up_blocks):
99
+ is_final_block = i == len(self.up_blocks) - 1
100
+
101
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
102
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
103
+
104
+ # if we have not reached the final block and need to forward the
105
+ # upsample size, we do it here
106
+ if not is_final_block and forward_upsample_size:
107
+ upsample_size = down_block_res_samples[-1].shape[2:]
108
+
109
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
110
+ sample = upsample_block(
111
+ hidden_states=sample,
112
+ temb=emb,
113
+ res_hidden_states_tuple=res_samples,
114
+ encoder_hidden_states=encoder_hidden_states[up_i:up_i+3],
115
+ upsample_size=upsample_size,
116
+ )
117
+ up_i += 3
118
+ else:
119
+ sample = upsample_block(
120
+ hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
121
+ )
122
+ # 6. post-process
123
+ sample = self.conv_norm_out(sample)
124
+ sample = self.conv_act(sample)
125
+ sample = self.conv_out(sample)
126
+
127
+ if not return_dict:
128
+ return (sample,)
129
+
130
+ return UNet2DConditionOutput(sample=sample)
131
+
132
+ def downblock_forward_XTI(
133
+ self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None
134
+ ):
135
+ output_states = ()
136
+ i = 0
137
+
138
+ for resnet, attn in zip(self.resnets, self.attentions):
139
+ if self.training and self.gradient_checkpointing:
140
+
141
+ def create_custom_forward(module, return_dict=None):
142
+ def custom_forward(*inputs):
143
+ if return_dict is not None:
144
+ return module(*inputs, return_dict=return_dict)
145
+ else:
146
+ return module(*inputs)
147
+
148
+ return custom_forward
149
+
150
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
151
+ hidden_states = torch.utils.checkpoint.checkpoint(
152
+ create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states[i]
153
+ )[0]
154
+ else:
155
+ hidden_states = resnet(hidden_states, temb)
156
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states[i]).sample
157
+
158
+ output_states += (hidden_states,)
159
+ i += 1
160
+
161
+ if self.downsamplers is not None:
162
+ for downsampler in self.downsamplers:
163
+ hidden_states = downsampler(hidden_states)
164
+
165
+ output_states += (hidden_states,)
166
+
167
+ return hidden_states, output_states
168
+
169
+ def upblock_forward_XTI(
170
+ self,
171
+ hidden_states,
172
+ res_hidden_states_tuple,
173
+ temb=None,
174
+ encoder_hidden_states=None,
175
+ upsample_size=None,
176
+ ):
177
+ i = 0
178
+ for resnet, attn in zip(self.resnets, self.attentions):
179
+ # pop res hidden states
180
+ res_hidden_states = res_hidden_states_tuple[-1]
181
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
182
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
183
+
184
+ if self.training and self.gradient_checkpointing:
185
+
186
+ def create_custom_forward(module, return_dict=None):
187
+ def custom_forward(*inputs):
188
+ if return_dict is not None:
189
+ return module(*inputs, return_dict=return_dict)
190
+ else:
191
+ return module(*inputs)
192
+
193
+ return custom_forward
194
+
195
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
196
+ hidden_states = torch.utils.checkpoint.checkpoint(
197
+ create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states[i]
198
+ )[0]
199
+ else:
200
+ hidden_states = resnet(hidden_states, temb)
201
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states[i]).sample
202
+
203
+ i += 1
204
+
205
+ if self.upsamplers is not None:
206
+ for upsampler in self.upsamplers:
207
+ hidden_states = upsampler(hidden_states, upsample_size)
208
+
209
+ return hidden_states
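The three functions above are drop-in replacements for the diffusers UNet and cross-attention block `forward` methods: instead of a single text embedding they index into a sequence of per-block embeddings (`encoder_hidden_states[down_i:down_i+2]`, `[6]`, `[up_i:up_i+3]`), as used by the XTI (P+ / extended textual inversion) training path. The diff does not show how they are wired in; below is a hedged sketch of one way such hijack functions can be bound to a model instance (the checkpoint name is only an example, and the training script in this repo may wire this differently):

```python
import types

from diffusers import UNet2DConditionModel
from XTI_hijack import unet_forward_XTI, downblock_forward_XTI, upblock_forward_XTI

# Load a UNet to patch; the model id here is illustrative.
unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet"
)

# Replace the top-level forward so it accepts a list of per-block text embeddings.
unet.forward = types.MethodType(unet_forward_XTI, unet)

# Replace the forward of every cross-attention down/up block the same way.
for block in unet.down_blocks:
    if getattr(block, "has_cross_attention", False):
        block.forward = types.MethodType(downblock_forward_XTI, block)
for block in unet.up_blocks:
    if getattr(block, "has_cross_attention", False):
        block.forward = types.MethodType(upblock_forward_XTI, block)
```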
_typos.toml ADDED
@@ -0,0 +1,15 @@
+# Files for typos
+# Instruction: https://github.com/marketplace/actions/typos-action#getting-started
+
+[default.extend-identifiers]
+
+[default.extend-words]
+NIN="NIN"
+parms="parms"
+nin="nin"
+extention="extention" # Intentionally left
+nd="nd"
+
+
+[files]
+extend-exclude = ["_typos.toml"]
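`_typos.toml` whitelists identifiers such as `NIN`, `parms`, and `extention` so the CI job defined in `.github/workflows/typos.yaml` above does not flag them. The same check can be run locally; a sketch assuming the crate-ci `typos` CLI is installed:

```bash
# Install the checker once, then run it against the repository with this config.
cargo install typos-cli
typos --config _typos.toml
```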
cache/huggingface/gradio/frpc/frpc_linux_amd64_v0.3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c791d1f047b41ff5885772fc4bf20b797c6059bbd82abb9e31de15e55d6a57c4
+size 11907224
config_README-ja.md ADDED
@@ -0,0 +1,279 @@
1
+ For non-Japanese speakers: this README is provided only in Japanese in the current state. Sorry for inconvenience. We will provide English version in the near future.
2
+
3
+ `--dataset_config` で渡すことができる設定ファイルに関する説明です。
4
+
5
+ ## 概要
6
+
7
+ 設定ファイルを渡すことにより、ユーザが細かい設定を行えるようにします。
8
+
9
+ * 複数のデータセットが設定可能になります
10
+ * 例えば `resolution` をデータセットごとに設定して、それらを混合して学習できます。
11
+ * DreamBooth の手法と fine tuning の手法の両方に対応している学習方法では、DreamBooth 方式と fine tuning 方式のデータセットを混合することが可能です。
12
+ * サブセットごとに設定を変更することが可能になります
13
+ * データセットを画像ディレクトリ別またはメタデータ別に分割したものがサブセットです。いくつかのサブセットが集まってデータセットを構成します。
14
+ * `keep_tokens` や `flip_aug` 等のオプションはサブセットごとに設定可能です。一方、`resolution` や `batch_size` といったオプションはデータセットごとに設定可能で、同じデータセットに属するサブセットでは値が共通になります。詳しくは後述します。
15
+
16
+ 設定ファイルの形式は JSON か TOML を利用できます。記述のしやすさを考えると [TOML](https://toml.io/ja/v1.0.0-rc.2) を利用するのがオススメです。以下、TOML の利用を前提に説明します。
17
+
18
+ TOML で記述した設定ファイルの例です。
19
+
20
+ ```toml
21
+ [general]
22
+ shuffle_caption = true
23
+ caption_extension = '.txt'
24
+ keep_tokens = 1
25
+
26
+ # これは DreamBooth 方式のデータセット
27
+ [[datasets]]
28
+ resolution = 512
29
+ batch_size = 4
30
+ keep_tokens = 2
31
+
32
+ [[datasets.subsets]]
33
+ image_dir = 'C:\hoge'
34
+ class_tokens = 'hoge girl'
35
+ # このサブセットは keep_tokens = 2 (所属する datasets の値が使われる)
36
+
37
+ [[datasets.subsets]]
38
+ image_dir = 'C:\fuga'
39
+ class_tokens = 'fuga boy'
40
+ keep_tokens = 3
41
+
42
+ [[datasets.subsets]]
43
+ is_reg = true
44
+ image_dir = 'C:\reg'
45
+ class_tokens = 'human'
46
+ keep_tokens = 1
47
+
48
+ # これは fine tuning 方式のデータセット
49
+ [[datasets]]
50
+ resolution = [768, 768]
51
+ batch_size = 2
52
+
53
+ [[datasets.subsets]]
54
+ image_dir = 'C:\piyo'
55
+ metadata_file = 'C:\piyo\piyo_md.json'
56
+ # このサブセットは keep_tokens = 1 (general の値が使われる)
57
+ ```
58
+
59
+ この例では、3 つのディレクトリを DreamBooth 方式のデータセットとして 512x512 (batch size 4) で学習させ、1 つのディレクトリを fine tuning 方式のデータセットとして 768x768 (batch size 2) で学習させることになります。
60
+
61
+ ## データセット・サブセットに関する設定
62
+
63
+ データセット・サブセットに関する設定は、登録可能な箇所がいくつかに分かれています。
64
+
65
+ * `[general]`
66
+ * 全データセットまたは全サブセットに適用されるオプションを指定する箇所です。
67
+ * データセットごとの設定及びサブセットごとの設定に同名のオプションが存在していた場合には、データセット・サブセットごとの設定が優先されます。
68
+ * `[[datasets]]`
69
+ * `datasets` はデータセットに関する設定の登録箇所になります。各データセットに個別に適用されるオプションを指定する箇所です。
70
+ * サブセットごとの設定が存在していた場合には、サブセットごとの設定が優先されます。
71
+ * `[[datasets.subsets]]`
72
+ * `datasets.subsets` はサブセットに関する設定の登録箇所になります。各サブセットに個別に適用されるオプションを指定する箇所です。
73
+
74
+ 先程の例における、画像ディレクトリと登録箇所の対応に関するイメージ図です。
75
+
76
+ ```
77
+ C:\
78
+ ├─ hoge -> [[datasets.subsets]] No.1 ┐ ┐
79
+ ├─ fuga -> [[datasets.subsets]] No.2 |-> [[datasets]] No.1 |-> [general]
80
+ ├─ reg -> [[datasets.subsets]] No.3 ┘ |
81
+ └─ piyo -> [[datasets.subsets]] No.4 --> [[datasets]] No.2 ┘
82
+ ```
83
+
84
+ 画像ディレクトリがそれぞれ1つの `[[datasets.subsets]]` に対応しています。そして `[[datasets.subsets]]` が1つ以上組み合わさって1つの `[[datasets]]` を構成します。`[general]` には全ての `[[datasets]]`, `[[datasets.subsets]]` が属します。
85
+
86
+ 登録箇所ごとに指定可能なオプションは異なりますが、同名のオプションが指定された場合は下位の登録箇所にある値が優先されます。先程の例の `keep_tokens` オプションの扱われ方を確認してもらうと理解しやすいかと思います。
87
+
88
+ 加えて、学習方法が対応している手法によっても指定可能なオプションが変化します。
89
+
90
+ * DreamBooth 方式専用のオプション
91
+ * fine tuning 方式専用のオプション
92
+ * caption dropout の手法が使える場合のオプション
93
+
94
+ DreamBooth の手法と fine tuning の手法の両方とも利用可能な学習方法では、両者を併用することができます。
95
+ 併用する際の注意点として、DreamBooth 方式なのか fine tuning 方式なのかはデータセット単位で判別を行っているため、同じデータセット中に DreamBooth 方式のサブセットと fine tuning 方式のサブセットを混在させることはできません。
96
+ つまり、これらを併用したい場合には異なる方式のサブセットが異なるデータセットに所属するように設定する必要があります。
97
+
98
+ プログラムの挙動としては、後述する `metadata_file` オプションが存在していたら fine tuning 方式のサブセットだと判断します。
99
+ そのため、同一のデータセットに所属するサブセットについて言うと、「全てが `metadata_file` オプションを持つ」か「全てが `metadata_file` オプションを持たない」かのどちらかになっていれば問題ありません。
100
+
101
+ 以下、利用可能なオプションを説明します。コマンドライン引数と名称が同一のオプションについては、基本的に説明を割愛します。他の README を参照してください。
102
+
103
+ ### 全学習方法で共通のオプション
104
+
105
+ 学習方法によらずに指定可能なオプションです。
106
+
107
+ #### データセット向けオプション
108
+
109
+ データセットの設定に関わるオプションです。`datasets.subsets` には記述できません。
110
+
111
+ | オプション名 | 設定例 | `[general]` | `[[datasets]]` |
112
+ | ---- | ---- | ---- | ---- |
113
+ | `batch_size` | `1` | o | o |
114
+ | `bucket_no_upscale` | `true` | o | o |
115
+ | `bucket_reso_steps` | `64` | o | o |
116
+ | `enable_bucket` | `true` | o | o |
117
+ | `max_bucket_reso` | `1024` | o | o |
118
+ | `min_bucket_reso` | `128` | o | o |
119
+ | `resolution` | `256`, `[512, 512]` | o | o |
120
+
121
+ * `batch_size`
122
+ * コマンドライン引数の `--train_batch_size` と同等です。
123
+
124
+ これらの設定はデータセットごとに固定です。
125
+ つまり、データセットに所属するサブセットはこれらの設定を共有することになります。
126
+ 例えば解像度が異なるデータセットを用意したい場合は、上に挙げた例のように別々のデータセットとして定義すれば別々の解像度を設定可能です。
127
+
128
+ #### サブセット向けオプション
129
+
130
+ サブセットの設定に関わるオプションです。
131
+
132
+ | オプション名 | 設定例 | `[general]` | `[[datasets]]` | `[[datasets.subsets]]` |
133
+ | ---- | ---- | ---- | ---- | ---- |
134
+ | `color_aug` | `false` | o | o | o |
135
+ | `face_crop_aug_range` | `[1.0, 3.0]` | o | o | o |
136
+ | `flip_aug` | `true` | o | o | o |
137
+ | `keep_tokens` | `2` | o | o | o |
138
+ | `num_repeats` | `10` | o | o | o |
139
+ | `random_crop` | `false` | o | o | o |
140
+ | `shuffle_caption` | `true` | o | o | o |
141
+
142
+ * `num_repeats`
143
+ * サブセットの画像の繰り返し回数を指定します。fine tuning における `--dataset_repeats` に相当しますが、`num_repeats` はどの学習方法でも指定可能です。
144
+
145
+ ### DreamBooth 方式専用のオプション
146
+
147
+ DreamBooth 方式のオプションは、サブセット向けオプションのみ存在します。
148
+
149
+ #### サブセット向けオプション
150
+
151
+ DreamBooth 方式のサブセットの設定に関わるオプションです。
152
+
153
+ | オプション名 | 設定例 | `[general]` | `[[datasets]]` | `[[datasets.subsets]]` |
154
+ | ---- | ---- | ---- | ---- | ---- |
155
+ | `image_dir` | `'C:\hoge'` | - | - | o(必須) |
156
+ | `caption_extension` | `".txt"` | o | o | o |
157
+ | `class_tokens` | `"sks girl"` | - | - | o |
158
+ | `is_reg` | `false` | - | - | o |
159
+
160
+ まず注意点として、 `image_dir` には画像ファイルが直下に置かれているパスを指定する必要があります。従来の DreamBooth の手法ではサブディレクトリに画像を置く必要がありましたが、そちらとは仕様に互換性がありません。また、`5_cat` のようなフォルダ名にしても、画像の繰り返し回数とクラス名は反映されません。これらを個別に設定したい場合、`num_repeats` と `class_tokens` で明示的に指定する必要があることに注意してください。
161
+
162
+ * `image_dir`
163
+ * 画像ディレクトリのパスを指定します。指定必須オプションです。
164
+ * 画像はディレクトリ直下に置かれている必要があります。
165
+ * `class_tokens`
166
+ * クラストークンを設定します。
167
+ * 画像に対応する caption ファイルが存在しない場合にのみ学習時に利用されます。利用するかどうかの判定は画像ごとに行います。`class_tokens` を指定しなかった場合に caption ファイルも見つからなかった場合にはエラーになります。
168
+ * `is_reg`
169
+ * サブセットの画像が正規化用かどうかを指定します。指定しなかった場合は `false` として、つまり正規化画像ではないとして扱います。
170
+
171
+ ### fine tuning 方式専用のオプション
172
+
173
+ fine tuning 方式のオプションは、サブセット向けオプションのみ存在します。
174
+
175
+ #### サブセット向けオプション
176
+
177
+ fine tuning 方式のサブセットの設定に関わるオプションです。
178
+
179
+ | オプション名 | 設定例 | `[general]` | `[[datasets]]` | `[[datasets.subsets]]` |
180
+ | ---- | ---- | ---- | ---- | ---- |
181
+ | `image_dir` | `'C:\hoge'` | - | - | o |
182
+ | `metadata_file` | `'C:\piyo\piyo_md.json'` | - | - | o(必須) |
183
+
184
+ * `image_dir`
185
+ * 画像ディレクトリのパスを指定します。DreamBooth の手法の方とは異なり指定は必須ではありませんが、設定することを推奨します。
186
+ * 指定する必要がない状況としては、メタデータファイルの生成時に `--full_path` を付与して実行していた場合です。
187
+ * 画像はディレクトリ直下に置かれている必要があります。
188
+ * `metadata_file`
189
+ * サブセットで利用されるメタデータファイルのパスを指定します。指定必須オプションです。
190
+ * コマンドライン引数の `--in_json` と同等です。
191
+ * サブセットごとにメタデータファイルを指定する必要がある仕様上、ディレクトリを跨いだメタデータを1つのメタデータファイルとして作成することは避けた方が良いでしょう。画像ディレクトリごとにメタデータファイルを用意し、それらを別々のサブセットとして登録することを強く推奨します。
192
+
193
+ ### caption dropout の手法が使える場合に指定可能なオプション
194
+
195
+ caption dropout の手法が使える場合のオプションは、サブセット向けオプションのみ存在します。
196
+ DreamBooth 方式か fine tuning 方式かに関わらず、caption dropout に対応している学習方法であれば指定可能です。
197
+
198
+ #### サブセット向けオプション
199
+
200
+ caption dropout が使えるサブセットの設定に関わるオプションです。
201
+
202
+ | オプション名 | `[general]` | `[[datasets]]` | `[[datasets.subsets]]` |
203
+ | ---- | ---- | ---- | ---- |
204
+ | `caption_dropout_every_n_epochs` | o | o | o |
205
+ | `caption_dropout_rate` | o | o | o |
206
+ | `caption_tag_dropout_rate` | o | o | o |
207
+
208
+ ## 重複したサブセットが存在する時の挙動
209
+
210
+ DreamBooth 方式のデータセットの場合、その中にある `image_dir` が同一のサブセットは重複していると見なされます。
211
+ fine tuning 方式のデータセットの場合は、その中にある `metadata_file` が同一のサブセットは重複していると見なされます。
212
+ データセット中に重複したサブセットが存在する場合、2個目以降は無視されます。
213
+
214
+ 一方、異なるデータセットに所属している場合は、重複しているとは見なされません。
215
+ 例えば、以下のように同一の `image_dir` を持つサブセットを別々のデータセットに入れた場合には、重複していないと見なします。
216
+ これは、同じ画像でも異なる解像度で学習したい場合に役立ちます。
217
+
218
+ ```toml
219
+ # 別々のデータセットに存在している場合は重複とは見なされず、両方とも学習に使われる
220
+
221
+ [[datasets]]
222
+ resolution = 512
223
+
224
+ [[datasets.subsets]]
225
+ image_dir = 'C:\hoge'
226
+
227
+ [[datasets]]
228
+ resolution = 768
229
+
230
+ [[datasets.subsets]]
231
+ image_dir = 'C:\hoge'
232
+ ```
233
+
234
+ ## コマンドライン引数との併用
235
+
236
+ 設定ファイルのオプションの中には、コマンドライン引数のオプションと役割が重複しているものがあります。
237
+
238
+ 以下に挙げるコマンドライン引数のオプションは、設定ファイルを渡した場合には無視されます。
239
+
240
+ * `--train_data_dir`
241
+ * `--reg_data_dir`
242
+ * `--in_json`
243
+
244
+ 以下に挙げるコマンドライン引数のオプションは、コマンドライン引数と設定ファイルで同時に指定された場合、コマンドライン引数の値よりも設定ファイルの値が優先されます。特に断りがなければ同名のオプションとなります。
245
+
246
+ | コマンドライン引数のオプション | 優先される設定ファイルのオプション |
247
+ | ---------------------------------- | ---------------------------------- |
248
+ | `--bucket_no_upscale` | |
249
+ | `--bucket_reso_steps` | |
250
+ | `--caption_dropout_every_n_epochs` | |
251
+ | `--caption_dropout_rate` | |
252
+ | `--caption_extension` | |
253
+ | `--caption_tag_dropout_rate` | |
254
+ | `--color_aug` | |
255
+ | `--dataset_repeats` | `num_repeats` |
256
+ | `--enable_bucket` | |
257
+ | `--face_crop_aug_range` | |
258
+ | `--flip_aug` | |
259
+ | `--keep_tokens` | |
260
+ | `--min_bucket_reso` | |
261
+ | `--random_crop` | |
262
+ | `--resolution` | |
263
+ | `--shuffle_caption` | |
264
+ | `--train_batch_size` | `batch_size` |
265
+
266
+ ## エラーの手引き
267
+
268
+ 現在、外部ライブラリを利用して設定ファイルの記述が正しいかどうかをチェックしているのですが、整備が行き届いておらずエラーメッセージがわかりづらいという問題があります。
269
+ 将来的にはこの問題の改善に取り組む予定です。
270
+
271
+ 次善策として、頻出のエラーとその対処法について載せておきます。
272
+ 正しいはずなのにエラーが出る場合、エラー内容がどうしても分からない場合は、バグかもしれないのでご連絡ください。
273
+
274
+ * `voluptuous.error.MultipleInvalid: required key not provided @ ...`: 指定必須のオプションが指定されていないというエラーです。指定を忘れているか、オプション名を間違って記述している可能性が高いです。
275
+ * `...` の箇所にはエラーが発生した場所が載っています。例えば `voluptuous.error.MultipleInvalid: required key not provided @ data['datasets'][0]['subsets'][0]['image_dir']` のようなエラーが出たら、0 番目の `datasets` 中の 0 番目の `subsets` の設定に `image_dir` が存在しないということになります。
276
+ * `voluptuous.error.MultipleInvalid: expected int for dictionary value @ ...`: 指定する値の形式が不正というエラーです。値の形式が間違っている可能性が高いです。`int` の部分は対象となるオプションによって変わります。この README に載っているオプションの「設定例」が役立つかもしれません。
277
+ * `voluptuous.error.MultipleInvalid: extra keys not allowed @ ...`: 対応していないオプション名が存在している場合に発生するエラーです。オプション名を間違って記述しているか、誤って紛れ込んでいる可能性が高いです。
278
+
279
+
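config_README-ja.md above documents the TOML dataset-configuration file passed via `--dataset_config` (multiple datasets and subsets, per-dataset `resolution`/`batch_size`, per-subset `keep_tokens`, and so on). A hedged sketch of passing such a file to one of the training scripts; the script name and the remaining flags are illustrative, not prescribed by the README:

```bash
accelerate launch --num_cpu_threads_per_process=2 train_db.py \
  --dataset_config="dataset_config.toml" \
  --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \
  --output_dir="./output" \
  --output_name="last"
```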
config_files/accelerate/default_config.yaml ADDED
@@ -0,0 +1,22 @@
+command_file: null
+commands: null
+compute_environment: LOCAL_MACHINE
+deepspeed_config: {}
+distributed_type: 'NO'
+downcast_bf16: 'no'
+dynamo_backend: 'NO'
+fsdp_config: {}
+gpu_ids: all
+machine_rank: 0
+main_process_ip: null
+main_process_port: null
+main_training_function: main
+megatron_lm_config: {}
+mixed_precision: 'no'
+num_machines: 1
+num_processes: 1
+rdzv_backend: static
+same_network: true
+tpu_name: null
+tpu_zone: null
+use_cpu: false
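This is a minimal single-process, single-machine Accelerate config with no mixed precision. Accelerate normally reads its default config from the user cache (typically `~/.cache/huggingface/accelerate/default_config.yaml`), but this repo-local file can be selected explicitly; a sketch, with the training script chosen only as an example:

```bash
accelerate launch --config_file config_files/accelerate/default_config.yaml \
  train_db.py --help
```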
dreambooth_gui.py ADDED
@@ -0,0 +1,944 @@
1
+ # v1: initial release
2
+ # v2: add open and save folder icons
3
+ # v3: Add new Utilities tab for Dreambooth folder preparation
4
+ # v3.1: Adding captioning of images to utilities
5
+
6
+ import gradio as gr
7
+ import json
8
+ import math
9
+ import os
10
+ import subprocess
11
+ import pathlib
12
+ import argparse
13
+ from library.common_gui import (
14
+ get_folder_path,
15
+ remove_doublequote,
16
+ get_file_path,
17
+ get_any_file_path,
18
+ get_saveasfile_path,
19
+ color_aug_changed,
20
+ save_inference_file,
21
+ gradio_advanced_training,
22
+ run_cmd_advanced_training,
23
+ run_cmd_training,
24
+ gradio_training,
25
+ gradio_config,
26
+ gradio_source_model,
27
+ # set_legacy_8bitadam,
28
+ update_my_data,
29
+ check_if_model_exist,
30
+ )
31
+ from library.tensorboard_gui import (
32
+ gradio_tensorboard,
33
+ start_tensorboard,
34
+ stop_tensorboard,
35
+ )
36
+ from library.dreambooth_folder_creation_gui import (
37
+ gradio_dreambooth_folder_creation_tab,
38
+ )
39
+ from library.utilities import utilities_tab
40
+ from library.sampler_gui import sample_gradio_config, run_cmd_sample
41
+ from easygui import msgbox
42
+
43
+ folder_symbol = '\U0001f4c2' # 📂
44
+ refresh_symbol = '\U0001f504' # 🔄
45
+ save_style_symbol = '\U0001f4be' # 💾
46
+ document_symbol = '\U0001F4C4' # 📄
47
+
48
+
49
+ def save_configuration(
50
+ save_as,
51
+ file_path,
52
+ pretrained_model_name_or_path,
53
+ v2,
54
+ v_parameterization,
55
+ logging_dir,
56
+ train_data_dir,
57
+ reg_data_dir,
58
+ output_dir,
59
+ max_resolution,
60
+ learning_rate,
61
+ lr_scheduler,
62
+ lr_warmup,
63
+ train_batch_size,
64
+ epoch,
65
+ save_every_n_epochs,
66
+ mixed_precision,
67
+ save_precision,
68
+ seed,
69
+ num_cpu_threads_per_process,
70
+ cache_latents,
71
+ caption_extension,
72
+ enable_bucket,
73
+ gradient_checkpointing,
74
+ full_fp16,
75
+ no_token_padding,
76
+ stop_text_encoder_training,
77
+ # use_8bit_adam,
78
+ xformers,
79
+ save_model_as,
80
+ shuffle_caption,
81
+ save_state,
82
+ resume,
83
+ prior_loss_weight,
84
+ color_aug,
85
+ flip_aug,
86
+ clip_skip,
87
+ vae,
88
+ output_name,
89
+ max_token_length,
90
+ max_train_epochs,
91
+ max_data_loader_n_workers,
92
+ mem_eff_attn,
93
+ gradient_accumulation_steps,
94
+ model_list,
95
+ keep_tokens,
96
+ persistent_data_loader_workers,
97
+ bucket_no_upscale,
98
+ random_crop,
99
+ bucket_reso_steps,
100
+ caption_dropout_every_n_epochs,
101
+ caption_dropout_rate,
102
+ optimizer,
103
+ optimizer_args,
104
+ noise_offset,
105
+ sample_every_n_steps,
106
+ sample_every_n_epochs,
107
+ sample_sampler,
108
+ sample_prompts,
109
+ additional_parameters,
110
+ vae_batch_size,
111
+ min_snr_gamma,
112
+ ):
113
+ # Get list of function parameters and values
114
+ parameters = list(locals().items())
115
+
116
+ original_file_path = file_path
117
+
118
+ save_as_bool = True if save_as.get('label') == 'True' else False
119
+
120
+ if save_as_bool:
121
+ print('Save as...')
122
+ file_path = get_saveasfile_path(file_path)
123
+ else:
124
+ print('Save...')
125
+ if file_path == None or file_path == '':
126
+ file_path = get_saveasfile_path(file_path)
127
+
128
+ # print(file_path)
129
+
130
+ if file_path == None or file_path == '':
131
+ return original_file_path # In case a file_path was provided and the user decide to cancel the open action
132
+
133
+ # Return the values of the variables as a dictionary
134
+ variables = {
135
+ name: value
136
+ for name, value in parameters # locals().items()
137
+ if name
138
+ not in [
139
+ 'file_path',
140
+ 'save_as',
141
+ ]
142
+ }
143
+
144
+ # Extract the destination directory from the file path
145
+ destination_directory = os.path.dirname(file_path)
146
+
147
+ # Create the destination directory if it doesn't exist
148
+ if not os.path.exists(destination_directory):
149
+ os.makedirs(destination_directory)
150
+
151
+ # Save the data to the selected file
152
+ with open(file_path, 'w') as file:
153
+ json.dump(variables, file, indent=2)
154
+
155
+ return file_path
156
+
157
+
158
+ def open_configuration(
159
+ ask_for_file,
160
+ file_path,
161
+ pretrained_model_name_or_path,
162
+ v2,
163
+ v_parameterization,
164
+ logging_dir,
165
+ train_data_dir,
166
+ reg_data_dir,
167
+ output_dir,
168
+ max_resolution,
169
+ learning_rate,
170
+ lr_scheduler,
171
+ lr_warmup,
172
+ train_batch_size,
173
+ epoch,
174
+ save_every_n_epochs,
175
+ mixed_precision,
176
+ save_precision,
177
+ seed,
178
+ num_cpu_threads_per_process,
179
+ cache_latents,
180
+ caption_extension,
181
+ enable_bucket,
182
+ gradient_checkpointing,
183
+ full_fp16,
184
+ no_token_padding,
185
+ stop_text_encoder_training,
186
+ # use_8bit_adam,
187
+ xformers,
188
+ save_model_as,
189
+ shuffle_caption,
190
+ save_state,
191
+ resume,
192
+ prior_loss_weight,
193
+ color_aug,
194
+ flip_aug,
195
+ clip_skip,
196
+ vae,
197
+ output_name,
198
+ max_token_length,
199
+ max_train_epochs,
200
+ max_data_loader_n_workers,
201
+ mem_eff_attn,
202
+ gradient_accumulation_steps,
203
+ model_list,
204
+ keep_tokens,
205
+ persistent_data_loader_workers,
206
+ bucket_no_upscale,
207
+ random_crop,
208
+ bucket_reso_steps,
209
+ caption_dropout_every_n_epochs,
210
+ caption_dropout_rate,
211
+ optimizer,
212
+ optimizer_args,
213
+ noise_offset,
214
+ sample_every_n_steps,
215
+ sample_every_n_epochs,
216
+ sample_sampler,
217
+ sample_prompts,
218
+ additional_parameters,
219
+ vae_batch_size,
220
+ min_snr_gamma,
221
+ ):
222
+ # Get list of function parameters and values
223
+ parameters = list(locals().items())
224
+
225
+ ask_for_file = True if ask_for_file.get('label') == 'True' else False
226
+
227
+ original_file_path = file_path
228
+
229
+ if ask_for_file:
230
+ file_path = get_file_path(file_path)
231
+
232
+ if not file_path == '' and not file_path == None:
233
+ # load variables from JSON file
234
+ with open(file_path, 'r') as f:
235
+ my_data = json.load(f)
236
+ print('Loading config...')
237
+ # Update values to fix deprecated use_8bit_adam checkbox and set appropriate optimizer if it is set to True
238
+ my_data = update_my_data(my_data)
239
+ else:
240
+ file_path = original_file_path # In case a file_path was provided and the user decide to cancel the open action
241
+ my_data = {}
242
+
243
+ values = [file_path]
244
+ for key, value in parameters:
245
+ # Set the value in the dictionary to the corresponding value in `my_data`, or the default value if not found
246
+ if not key in ['ask_for_file', 'file_path']:
247
+ values.append(my_data.get(key, value))
248
+ return tuple(values)
249
+
250
+
251
+ def train_model(
252
+ pretrained_model_name_or_path,
253
+ v2,
254
+ v_parameterization,
255
+ logging_dir,
256
+ train_data_dir,
257
+ reg_data_dir,
258
+ output_dir,
259
+ max_resolution,
260
+ learning_rate,
261
+ lr_scheduler,
262
+ lr_warmup,
263
+ train_batch_size,
264
+ epoch,
265
+ save_every_n_epochs,
266
+ mixed_precision,
267
+ save_precision,
268
+ seed,
269
+ num_cpu_threads_per_process,
270
+ cache_latents,
271
+ caption_extension,
272
+ enable_bucket,
273
+ gradient_checkpointing,
274
+ full_fp16,
275
+ no_token_padding,
276
+ stop_text_encoder_training_pct,
277
+ # use_8bit_adam,
278
+ xformers,
279
+ save_model_as,
280
+ shuffle_caption,
281
+ save_state,
282
+ resume,
283
+ prior_loss_weight,
284
+ color_aug,
285
+ flip_aug,
286
+ clip_skip,
287
+ vae,
288
+ output_name,
289
+ max_token_length,
290
+ max_train_epochs,
291
+ max_data_loader_n_workers,
292
+ mem_eff_attn,
293
+ gradient_accumulation_steps,
294
+ model_list, # Keep this. Yes, it is unused here but required given the common list used
295
+ keep_tokens,
296
+ persistent_data_loader_workers,
297
+ bucket_no_upscale,
298
+ random_crop,
299
+ bucket_reso_steps,
300
+ caption_dropout_every_n_epochs,
301
+ caption_dropout_rate,
302
+ optimizer,
303
+ optimizer_args,
304
+ noise_offset,
305
+ sample_every_n_steps,
306
+ sample_every_n_epochs,
307
+ sample_sampler,
308
+ sample_prompts,
309
+ additional_parameters,
310
+ vae_batch_size,
311
+ min_snr_gamma,
312
+ ):
313
+ if pretrained_model_name_or_path == '':
314
+ msgbox('Source model information is missing')
315
+ return
316
+
317
+ if train_data_dir == '':
318
+ msgbox('Image folder path is missing')
319
+ return
320
+
321
+ if not os.path.exists(train_data_dir):
322
+ msgbox('Image folder does not exist')
323
+ return
324
+
325
+ if reg_data_dir != '':
326
+ if not os.path.exists(reg_data_dir):
327
+ msgbox('Regularisation folder does not exist')
328
+ return
329
+
330
+ if output_dir == '':
331
+ msgbox('Output folder path is missing')
332
+ return
333
+
334
+ if check_if_model_exist(output_name, output_dir, save_model_as):
335
+ return
336
+
337
+ # Get a list of all subfolders in train_data_dir, excluding hidden folders
338
+ subfolders = [
339
+ f
340
+ for f in os.listdir(train_data_dir)
341
+ if os.path.isdir(os.path.join(train_data_dir, f))
342
+ and not f.startswith('.')
343
+ ]
344
+
345
+ # Check if subfolders are present. If not let the user know and return
346
+ if not subfolders:
347
+ print(
348
+ '\033[33mNo subfolders were found in',
349
+ train_data_dir,
350
+ " can't train...\033[0m",
351
+ )
352
+ return
353
+
354
+ total_steps = 0
355
+
356
+ # Loop through each subfolder and extract the number of repeats
357
+ for folder in subfolders:
358
+ # Extract the number of repeats from the folder name
359
+ try:
360
+ repeats = int(folder.split('_')[0])
361
+ except ValueError:
362
+ print(
363
+ '\033[33mSubfolder',
364
+ folder,
365
+ "does not have a proper repeat value, please correct the name or remove it... can't train...\033[0m",
366
+ )
367
+ continue
368
+
369
+ # Count the number of images in the folder
370
+ num_images = len(
371
+ [
372
+ f
373
+ for f, lower_f in (
374
+ (file, file.lower())
375
+ for file in os.listdir(
376
+ os.path.join(train_data_dir, folder)
377
+ )
378
+ )
379
+ if lower_f.endswith(('.jpg', '.jpeg', '.png', '.webp'))
380
+ ]
381
+ )
382
+
383
+ if num_images == 0:
384
+ print(f'{folder} folder contains no images, skipping...')
385
+ else:
386
+ # Calculate the total number of steps for this folder
387
+ steps = repeats * num_images
388
+ total_steps += steps
389
+
390
+ # Print the result
391
+ print('\033[33mFolder', folder, ':', steps, 'steps\033[0m')
392
+
393
+ if total_steps == 0:
394
+ print(
395
+ '\033[33mNo images were found in folder',
396
+ train_data_dir,
397
+ '... please rectify!\033[0m',
398
+ )
399
+ return
400
+
401
+ # Print the result
402
+ # print(f"{total_steps} total steps")
403
+
404
+ if reg_data_dir == '':
405
+ reg_factor = 1
406
+ else:
407
+ print(
408
+ '\033[94mRegularisation images are used... Will double the number of steps required...\033[0m'
409
+ )
410
+ reg_factor = 2
411
+
412
+ # calculate max_train_steps
413
+ max_train_steps = int(
414
+ math.ceil(
415
+ float(total_steps)
416
+ / int(train_batch_size)
417
+ * int(epoch)
418
+ * int(reg_factor)
419
+ )
420
+ )
421
+ print(f'max_train_steps = {max_train_steps}')
422
+
423
+ # calculate stop encoder training
424
+ if int(stop_text_encoder_training_pct) == -1:
425
+ stop_text_encoder_training = -1
426
+ elif stop_text_encoder_training_pct == None:
427
+ stop_text_encoder_training = 0
428
+ else:
429
+ stop_text_encoder_training = math.ceil(
430
+ float(max_train_steps) / 100 * int(stop_text_encoder_training_pct)
431
+ )
432
+ print(f'stop_text_encoder_training = {stop_text_encoder_training}')
433
+
434
+ lr_warmup_steps = round(float(int(lr_warmup) * int(max_train_steps) / 100))
435
+ print(f'lr_warmup_steps = {lr_warmup_steps}')
436
+
437
+ run_cmd = f'accelerate launch --num_cpu_threads_per_process={num_cpu_threads_per_process} "train_db.py"'
438
+ if v2:
439
+ run_cmd += ' --v2'
440
+ if v_parameterization:
441
+ run_cmd += ' --v_parameterization'
442
+ if enable_bucket:
443
+ run_cmd += ' --enable_bucket'
444
+ if no_token_padding:
445
+ run_cmd += ' --no_token_padding'
446
+ run_cmd += (
447
+ f' --pretrained_model_name_or_path="{pretrained_model_name_or_path}"'
448
+ )
449
+ run_cmd += f' --train_data_dir="{train_data_dir}"'
450
+ if len(reg_data_dir):
451
+ run_cmd += f' --reg_data_dir="{reg_data_dir}"'
452
+ run_cmd += f' --resolution={max_resolution}'
453
+ run_cmd += f' --output_dir="{output_dir}"'
454
+ run_cmd += f' --logging_dir="{logging_dir}"'
455
+ if not stop_text_encoder_training == 0:
456
+ run_cmd += (
457
+ f' --stop_text_encoder_training={stop_text_encoder_training}'
458
+ )
459
+ if not save_model_as == 'same as source model':
460
+ run_cmd += f' --save_model_as={save_model_as}'
461
+ # if not resume == '':
462
+ # run_cmd += f' --resume={resume}'
463
+ if not float(prior_loss_weight) == 1.0:
464
+ run_cmd += f' --prior_loss_weight={prior_loss_weight}'
465
+ if not vae == '':
466
+ run_cmd += f' --vae="{vae}"'
467
+ if not output_name == '':
468
+ run_cmd += f' --output_name="{output_name}"'
469
+ if int(max_token_length) > 75:
470
+ run_cmd += f' --max_token_length={max_token_length}'
471
+ if not max_train_epochs == '':
472
+ run_cmd += f' --max_train_epochs="{max_train_epochs}"'
473
+ if not max_data_loader_n_workers == '':
474
+ run_cmd += (
475
+ f' --max_data_loader_n_workers="{max_data_loader_n_workers}"'
476
+ )
477
+ if int(gradient_accumulation_steps) > 1:
478
+ run_cmd += f' --gradient_accumulation_steps={int(gradient_accumulation_steps)}'
479
+
480
+ run_cmd += run_cmd_training(
481
+ learning_rate=learning_rate,
482
+ lr_scheduler=lr_scheduler,
483
+ lr_warmup_steps=lr_warmup_steps,
484
+ train_batch_size=train_batch_size,
485
+ max_train_steps=max_train_steps,
486
+ save_every_n_epochs=save_every_n_epochs,
487
+ mixed_precision=mixed_precision,
488
+ save_precision=save_precision,
489
+ seed=seed,
490
+ caption_extension=caption_extension,
491
+ cache_latents=cache_latents,
492
+ optimizer=optimizer,
493
+ optimizer_args=optimizer_args,
494
+ )
495
+
496
+ run_cmd += run_cmd_advanced_training(
497
+ max_train_epochs=max_train_epochs,
498
+ max_data_loader_n_workers=max_data_loader_n_workers,
499
+ max_token_length=max_token_length,
500
+ resume=resume,
501
+ save_state=save_state,
502
+ mem_eff_attn=mem_eff_attn,
503
+ clip_skip=clip_skip,
504
+ flip_aug=flip_aug,
505
+ color_aug=color_aug,
506
+ shuffle_caption=shuffle_caption,
507
+ gradient_checkpointing=gradient_checkpointing,
508
+ full_fp16=full_fp16,
509
+ xformers=xformers,
510
+ # use_8bit_adam=use_8bit_adam,
511
+ keep_tokens=keep_tokens,
512
+ persistent_data_loader_workers=persistent_data_loader_workers,
513
+ bucket_no_upscale=bucket_no_upscale,
514
+ random_crop=random_crop,
515
+ bucket_reso_steps=bucket_reso_steps,
516
+ caption_dropout_every_n_epochs=caption_dropout_every_n_epochs,
517
+ caption_dropout_rate=caption_dropout_rate,
518
+ noise_offset=noise_offset,
519
+ additional_parameters=additional_parameters,
520
+ vae_batch_size=vae_batch_size,
521
+ min_snr_gamma=min_snr_gamma,
522
+ )
523
+
524
+ run_cmd += run_cmd_sample(
525
+ sample_every_n_steps,
526
+ sample_every_n_epochs,
527
+ sample_sampler,
528
+ sample_prompts,
529
+ output_dir,
530
+ )
531
+
532
+ print(run_cmd)
533
+
534
+ # Run the command
535
+ if os.name == 'posix':
536
+ os.system(run_cmd)
537
+ else:
538
+ subprocess.run(run_cmd)
539
+
540
+ # check if output_dir/output_name is a folder... if so, it is a Diffusers model
541
+ last_dir = pathlib.Path(f'{output_dir}/{output_name}')
542
+
543
+ if not last_dir.is_dir():
544
+ # Copy inference model for v2 if required
545
+ save_inference_file(output_dir, v2, v_parameterization, output_name)
546
+
547
+
548
+ def dreambooth_tab(
549
+ train_data_dir=gr.Textbox(),
550
+ reg_data_dir=gr.Textbox(),
551
+ output_dir=gr.Textbox(),
552
+ logging_dir=gr.Textbox(),
553
+ ):
554
+ dummy_db_true = gr.Label(value=True, visible=False)
555
+ dummy_db_false = gr.Label(value=False, visible=False)
556
+ gr.Markdown('Train a custom model using kohya dreambooth python code...')
557
+ (
558
+ button_open_config,
559
+ button_save_config,
560
+ button_save_as_config,
561
+ config_file_name,
562
+ button_load_config,
563
+ ) = gradio_config()
564
+
565
+ (
566
+ pretrained_model_name_or_path,
567
+ v2,
568
+ v_parameterization,
569
+ save_model_as,
570
+ model_list,
571
+ ) = gradio_source_model()
572
+
573
+ with gr.Tab('Folders'):
574
+ with gr.Row():
575
+ train_data_dir = gr.Textbox(
576
+ label='Image folder',
577
+ placeholder='Folder where the training folders containing the images are located',
578
+ )
579
+ train_data_dir_input_folder = gr.Button(
580
+ '📂', elem_id='open_folder_small'
581
+ )
582
+ train_data_dir_input_folder.click(
583
+ get_folder_path,
584
+ outputs=train_data_dir,
585
+ show_progress=False,
586
+ )
587
+ reg_data_dir = gr.Textbox(
588
+ label='Regularisation folder',
589
+ placeholder='(Optional) Folder where the regularization folders containing the images are located',
590
+ )
591
+ reg_data_dir_input_folder = gr.Button(
592
+ '📂', elem_id='open_folder_small'
593
+ )
594
+ reg_data_dir_input_folder.click(
595
+ get_folder_path,
596
+ outputs=reg_data_dir,
597
+ show_progress=False,
598
+ )
599
+ with gr.Row():
600
+ output_dir = gr.Textbox(
601
+ label='Model output folder',
602
+ placeholder='Folder to output trained model',
603
+ )
604
+ output_dir_input_folder = gr.Button(
605
+ '📂', elem_id='open_folder_small'
606
+ )
607
+ output_dir_input_folder.click(get_folder_path, outputs=output_dir)
608
+ logging_dir = gr.Textbox(
609
+ label='Logging folder',
610
+ placeholder='Optional: enable logging and output TensorBoard log to this folder',
611
+ )
612
+ logging_dir_input_folder = gr.Button(
613
+ '📂', elem_id='open_folder_small'
614
+ )
615
+ logging_dir_input_folder.click(
616
+ get_folder_path,
617
+ outputs=logging_dir,
618
+ show_progress=False,
619
+ )
620
+ with gr.Row():
621
+ output_name = gr.Textbox(
622
+ label='Model output name',
623
+ placeholder='Name of the model to output',
624
+ value='last',
625
+ interactive=True,
626
+ )
627
+ train_data_dir.change(
628
+ remove_doublequote,
629
+ inputs=[train_data_dir],
630
+ outputs=[train_data_dir],
631
+ )
632
+ reg_data_dir.change(
633
+ remove_doublequote,
634
+ inputs=[reg_data_dir],
635
+ outputs=[reg_data_dir],
636
+ )
637
+ output_dir.change(
638
+ remove_doublequote,
639
+ inputs=[output_dir],
640
+ outputs=[output_dir],
641
+ )
642
+ logging_dir.change(
643
+ remove_doublequote,
644
+ inputs=[logging_dir],
645
+ outputs=[logging_dir],
646
+ )
647
+ with gr.Tab('Training parameters'):
648
+ (
649
+ learning_rate,
650
+ lr_scheduler,
651
+ lr_warmup,
652
+ train_batch_size,
653
+ epoch,
654
+ save_every_n_epochs,
655
+ mixed_precision,
656
+ save_precision,
657
+ num_cpu_threads_per_process,
658
+ seed,
659
+ caption_extension,
660
+ cache_latents,
661
+ optimizer,
662
+ optimizer_args,
663
+ ) = gradio_training(
664
+ learning_rate_value='1e-5',
665
+ lr_scheduler_value='cosine',
666
+ lr_warmup_value='10',
667
+ )
668
+ with gr.Row():
669
+ max_resolution = gr.Textbox(
670
+ label='Max resolution',
671
+ value='512,512',
672
+ placeholder='512,512',
673
+ )
674
+ stop_text_encoder_training = gr.Slider(
675
+ minimum=-1,
676
+ maximum=100,
677
+ value=0,
678
+ step=1,
679
+ label='Stop text encoder training',
680
+ )
681
+ enable_bucket = gr.Checkbox(label='Enable buckets', value=True)
682
+ with gr.Accordion('Advanced Configuration', open=False):
683
+ with gr.Row():
684
+ no_token_padding = gr.Checkbox(
685
+ label='No token padding', value=False
686
+ )
687
+ gradient_accumulation_steps = gr.Number(
688
+ label='Gradient accumulate steps', value='1'
689
+ )
690
+ with gr.Row():
691
+ prior_loss_weight = gr.Number(
692
+ label='Prior loss weight', value=1.0
693
+ )
694
+ vae = gr.Textbox(
695
+ label='VAE',
696
+ placeholder='(Optional) path to checkpoint of VAE to replace for training',
697
+ )
698
+ vae_button = gr.Button('📂', elem_id='open_folder_small')
699
+ vae_button.click(
700
+ get_any_file_path,
701
+ outputs=vae,
702
+ show_progress=False,
703
+ )
704
+ (
705
+ # use_8bit_adam,
706
+ xformers,
707
+ full_fp16,
708
+ gradient_checkpointing,
709
+ shuffle_caption,
710
+ color_aug,
711
+ flip_aug,
712
+ clip_skip,
713
+ mem_eff_attn,
714
+ save_state,
715
+ resume,
716
+ max_token_length,
717
+ max_train_epochs,
718
+ max_data_loader_n_workers,
719
+ keep_tokens,
720
+ persistent_data_loader_workers,
721
+ bucket_no_upscale,
722
+ random_crop,
723
+ bucket_reso_steps,
724
+ caption_dropout_every_n_epochs,
725
+ caption_dropout_rate,
726
+ noise_offset,
727
+ additional_parameters,
728
+ vae_batch_size,
729
+ min_snr_gamma,
730
+ ) = gradio_advanced_training()
731
+ color_aug.change(
732
+ color_aug_changed,
733
+ inputs=[color_aug],
734
+ outputs=[cache_latents],
735
+ )
736
+
737
+ (
738
+ sample_every_n_steps,
739
+ sample_every_n_epochs,
740
+ sample_sampler,
741
+ sample_prompts,
742
+ ) = sample_gradio_config()
743
+
744
+ with gr.Tab('Tools'):
745
+ gr.Markdown(
746
+ 'This section provides Dreambooth tools to help set up your dataset...'
747
+ )
748
+ gradio_dreambooth_folder_creation_tab(
749
+ train_data_dir_input=train_data_dir,
750
+ reg_data_dir_input=reg_data_dir,
751
+ output_dir_input=output_dir,
752
+ logging_dir_input=logging_dir,
753
+ )
754
+
755
+ button_run = gr.Button('Train model', variant='primary')
756
+
757
+ # Setup gradio tensorboard buttons
758
+ button_start_tensorboard, button_stop_tensorboard = gradio_tensorboard()
759
+
760
+ button_start_tensorboard.click(
761
+ start_tensorboard,
762
+ inputs=logging_dir,
763
+ show_progress=False,
764
+ )
765
+
766
+ button_stop_tensorboard.click(
767
+ stop_tensorboard,
768
+ show_progress=False,
769
+ )
770
+
771
+ settings_list = [
772
+ pretrained_model_name_or_path,
773
+ v2,
774
+ v_parameterization,
775
+ logging_dir,
776
+ train_data_dir,
777
+ reg_data_dir,
778
+ output_dir,
779
+ max_resolution,
780
+ learning_rate,
781
+ lr_scheduler,
782
+ lr_warmup,
783
+ train_batch_size,
784
+ epoch,
785
+ save_every_n_epochs,
786
+ mixed_precision,
787
+ save_precision,
788
+ seed,
789
+ num_cpu_threads_per_process,
790
+ cache_latents,
791
+ caption_extension,
792
+ enable_bucket,
793
+ gradient_checkpointing,
794
+ full_fp16,
795
+ no_token_padding,
796
+ stop_text_encoder_training,
797
+ # use_8bit_adam,
798
+ xformers,
799
+ save_model_as,
800
+ shuffle_caption,
801
+ save_state,
802
+ resume,
803
+ prior_loss_weight,
804
+ color_aug,
805
+ flip_aug,
806
+ clip_skip,
807
+ vae,
808
+ output_name,
809
+ max_token_length,
810
+ max_train_epochs,
811
+ max_data_loader_n_workers,
812
+ mem_eff_attn,
813
+ gradient_accumulation_steps,
814
+ model_list,
815
+ keep_tokens,
816
+ persistent_data_loader_workers,
817
+ bucket_no_upscale,
818
+ random_crop,
819
+ bucket_reso_steps,
820
+ caption_dropout_every_n_epochs,
821
+ caption_dropout_rate,
822
+ optimizer,
823
+ optimizer_args,
824
+ noise_offset,
825
+ sample_every_n_steps,
826
+ sample_every_n_epochs,
827
+ sample_sampler,
828
+ sample_prompts,
829
+ additional_parameters,
830
+ vae_batch_size,
831
+ min_snr_gamma,
832
+ ]
833
+
834
+ button_open_config.click(
835
+ open_configuration,
836
+ inputs=[dummy_db_true, config_file_name] + settings_list,
837
+ outputs=[config_file_name] + settings_list,
838
+ show_progress=False,
839
+ )
840
+
841
+ button_load_config.click(
842
+ open_configuration,
843
+ inputs=[dummy_db_false, config_file_name] + settings_list,
844
+ outputs=[config_file_name] + settings_list,
845
+ show_progress=False,
846
+ )
847
+
848
+ button_save_config.click(
849
+ save_configuration,
850
+ inputs=[dummy_db_false, config_file_name] + settings_list,
851
+ outputs=[config_file_name],
852
+ show_progress=False,
853
+ )
854
+
855
+ button_save_as_config.click(
856
+ save_configuration,
857
+ inputs=[dummy_db_true, config_file_name] + settings_list,
858
+ outputs=[config_file_name],
859
+ show_progress=False,
860
+ )
861
+
862
+ button_run.click(
863
+ train_model,
864
+ inputs=settings_list,
865
+ show_progress=False,
866
+ )
867
+
868
+ return (
869
+ train_data_dir,
870
+ reg_data_dir,
871
+ output_dir,
872
+ logging_dir,
873
+ )
874
+
875
+
876
+ def UI(**kwargs):
877
+ css = ''
878
+
879
+ if os.path.exists('./style.css'):
880
+ with open(os.path.join('./style.css'), 'r', encoding='utf8') as file:
881
+ print('Load CSS...')
882
+ css += file.read() + '\n'
883
+
884
+ interface = gr.Blocks(css=css)
885
+
886
+ with interface:
887
+ with gr.Tab('Dreambooth'):
888
+ (
889
+ train_data_dir_input,
890
+ reg_data_dir_input,
891
+ output_dir_input,
892
+ logging_dir_input,
893
+ ) = dreambooth_tab()
894
+ with gr.Tab('Utilities'):
895
+ utilities_tab(
896
+ train_data_dir_input=train_data_dir_input,
897
+ reg_data_dir_input=reg_data_dir_input,
898
+ output_dir_input=output_dir_input,
899
+ logging_dir_input=logging_dir_input,
900
+ enable_copy_info_button=True,
901
+ )
902
+
903
+ # Show the interface
904
+ launch_kwargs = {}
905
+ if not kwargs.get('username', None) == '':
906
+ launch_kwargs['auth'] = (
907
+ kwargs.get('username', None),
908
+ kwargs.get('password', None),
909
+ )
910
+ if kwargs.get('server_port', 0) > 0:
911
+ launch_kwargs['server_port'] = kwargs.get('server_port', 0)
912
+ if kwargs.get('inbrowser', False):
913
+ launch_kwargs['inbrowser'] = kwargs.get('inbrowser', False)
914
+ print(launch_kwargs)
915
+ interface.launch(**launch_kwargs)
916
+
917
+
918
+ if __name__ == '__main__':
919
+ # torch.cuda.set_per_process_memory_fraction(0.48)
920
+ parser = argparse.ArgumentParser()
921
+ parser.add_argument(
922
+ '--username', type=str, default='', help='Username for authentication'
923
+ )
924
+ parser.add_argument(
925
+ '--password', type=str, default='', help='Password for authentication'
926
+ )
927
+ parser.add_argument(
928
+ '--server_port',
929
+ type=int,
930
+ default=0,
931
+ help='Port to run the server listener on',
932
+ )
933
+ parser.add_argument(
934
+ '--inbrowser', action='store_true', help='Open in browser'
935
+ )
936
+
937
+ args = parser.parse_args()
938
+
939
+ UI(
940
+ username=args.username,
941
+ password=args.password,
942
+ inbrowser=args.inbrowser,
943
+ server_port=args.server_port,
944
+ )
fine_tune.py ADDED
@@ -0,0 +1,430 @@
1
+ # training with captions
2
+ # XXX dropped option: hypernetwork training
3
+
4
+ import argparse
5
+ import gc
6
+ import math
7
+ import os
8
+ import toml
9
+ from multiprocessing import Value
10
+
11
+ from tqdm import tqdm
12
+ import torch
13
+ from accelerate.utils import set_seed
14
+ import diffusers
15
+ from diffusers import DDPMScheduler
16
+
17
+ import library.train_util as train_util
18
+ import library.config_util as config_util
19
+ from library.config_util import (
20
+ ConfigSanitizer,
21
+ BlueprintGenerator,
22
+ )
23
+ import library.custom_train_functions as custom_train_functions
24
+ from library.custom_train_functions import apply_snr_weight
25
+
26
+
27
+ def train(args):
28
+ train_util.verify_training_args(args)
29
+ train_util.prepare_dataset_args(args, True)
30
+
31
+ cache_latents = args.cache_latents
32
+
33
+ if args.seed is not None:
34
+ set_seed(args.seed) # 乱数系列を初期化する
35
+
36
+ tokenizer = train_util.load_tokenizer(args)
37
+
38
+ blueprint_generator = BlueprintGenerator(ConfigSanitizer(False, True, True))
39
+ if args.dataset_config is not None:
40
+ print(f"Load dataset config from {args.dataset_config}")
41
+ user_config = config_util.load_user_config(args.dataset_config)
42
+ ignored = ["train_data_dir", "in_json"]
43
+ if any(getattr(args, attr) is not None for attr in ignored):
44
+ print(
45
+ "ignore following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format(
46
+ ", ".join(ignored)
47
+ )
48
+ )
49
+ else:
50
+ user_config = {
51
+ "datasets": [
52
+ {
53
+ "subsets": [
54
+ {
55
+ "image_dir": args.train_data_dir,
56
+ "metadata_file": args.in_json,
57
+ }
58
+ ]
59
+ }
60
+ ]
61
+ }
62
+
63
+ blueprint = blueprint_generator.generate(user_config, args, tokenizer=tokenizer)
64
+ train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group)
65
+
66
+ current_epoch = Value("i", 0)
67
+ current_step = Value("i", 0)
68
+ ds_for_collater = train_dataset_group if args.max_data_loader_n_workers == 0 else None
69
+ collater = train_util.collater_class(current_epoch, current_step, ds_for_collater)
70
+
71
+ if args.debug_dataset:
72
+ train_util.debug_dataset(train_dataset_group)
73
+ return
74
+ if len(train_dataset_group) == 0:
75
+ print(
76
+ "No data found. Please verify the metadata file and train_data_dir option. / 画像がありません。メタデータおよびtrain_data_dirオプションを確認してください。"
77
+ )
78
+ return
79
+
80
+ if cache_latents:
81
+ assert (
82
+ train_dataset_group.is_latent_cacheable()
83
+ ), "when caching latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません"
84
+
85
+ # acceleratorを準備する
86
+ print("prepare accelerator")
87
+ accelerator, unwrap_model = train_util.prepare_accelerator(args)
88
+
89
+ # mixed precisionに対応した型を用意しておき適宜castする
90
+ weight_dtype, save_dtype = train_util.prepare_dtype(args)
91
+
92
+ # モデルを読み込む
93
+ text_encoder, vae, unet, load_stable_diffusion_format = train_util.load_target_model(args, weight_dtype)
94
+
95
+ # verify load/save model formats
96
+ if load_stable_diffusion_format:
97
+ src_stable_diffusion_ckpt = args.pretrained_model_name_or_path
98
+ src_diffusers_model_path = None
99
+ else:
100
+ src_stable_diffusion_ckpt = None
101
+ src_diffusers_model_path = args.pretrained_model_name_or_path
102
+
103
+ if args.save_model_as is None:
104
+ save_stable_diffusion_format = load_stable_diffusion_format
105
+ use_safetensors = args.use_safetensors
106
+ else:
107
+ save_stable_diffusion_format = args.save_model_as.lower() == "ckpt" or args.save_model_as.lower() == "safetensors"
108
+ use_safetensors = args.use_safetensors or ("safetensors" in args.save_model_as.lower())
109
+
110
+ # Diffusers版のxformers使用フラグを設定する関数
111
+ def set_diffusers_xformers_flag(model, valid):
112
+ # model.set_use_memory_efficient_attention_xformers(valid) # 次のリリースでなくなりそう
113
+ # pipeが自動で再帰的にset_use_memory_efficient_attention_xformersを探すんだって(;´Д`)
114
+ # U-Netだけ使う時にはどうすればいいのか……仕方ないからコピって使うか
115
+ # 0.10.2でなんか巻き戻って個別に指定するようになった(;^ω^)
116
+
117
+ # Recursively walk through all the children.
118
+ # Any children which exposes the set_use_memory_efficient_attention_xformers method
119
+ # gets the message
120
+ def fn_recursive_set_mem_eff(module: torch.nn.Module):
121
+ if hasattr(module, "set_use_memory_efficient_attention_xformers"):
122
+ module.set_use_memory_efficient_attention_xformers(valid)
123
+
124
+ for child in module.children():
125
+ fn_recursive_set_mem_eff(child)
126
+
127
+ fn_recursive_set_mem_eff(model)
128
+
129
+ # モデルに xformers とか memory efficient attention を組み込む
130
+ if args.diffusers_xformers:
131
+ print("Use xformers by Diffusers")
132
+ set_diffusers_xformers_flag(unet, True)
133
+ else:
134
+ # Windows版のxformersはfloatで学習できないのでxformersを使わない設定も可能にしておく必要がある
135
+ print("Disable Diffusers' xformers")
136
+ set_diffusers_xformers_flag(unet, False)
137
+ train_util.replace_unet_modules(unet, args.mem_eff_attn, args.xformers)
138
+
139
+ # 学習を準備する
140
+ if cache_latents:
141
+ vae.to(accelerator.device, dtype=weight_dtype)
142
+ vae.requires_grad_(False)
143
+ vae.eval()
144
+ with torch.no_grad():
145
+ train_dataset_group.cache_latents(vae, args.vae_batch_size)
146
+ vae.to("cpu")
147
+ if torch.cuda.is_available():
148
+ torch.cuda.empty_cache()
149
+ gc.collect()
150
+
151
+ # 学習を準備する:モデルを適切な状態にする
152
+ training_models = []
153
+ if args.gradient_checkpointing:
154
+ unet.enable_gradient_checkpointing()
155
+ training_models.append(unet)
156
+
157
+ if args.train_text_encoder:
158
+ print("enable text encoder training")
159
+ if args.gradient_checkpointing:
160
+ text_encoder.gradient_checkpointing_enable()
161
+ training_models.append(text_encoder)
162
+ else:
163
+ text_encoder.to(accelerator.device, dtype=weight_dtype)
164
+ text_encoder.requires_grad_(False) # text encoderは学習しない
165
+ if args.gradient_checkpointing:
166
+ text_encoder.gradient_checkpointing_enable()
167
+ text_encoder.train() # required for gradient_checkpointing
168
+ else:
169
+ text_encoder.eval()
170
+
171
+ if not cache_latents:
172
+ vae.requires_grad_(False)
173
+ vae.eval()
174
+ vae.to(accelerator.device, dtype=weight_dtype)
175
+
176
+ for m in training_models:
177
+ m.requires_grad_(True)
178
+ params = []
179
+ for m in training_models:
180
+ params.extend(m.parameters())
181
+ params_to_optimize = params
182
+
183
+ # 学習に必要なクラスを準備する
184
+ print("prepare optimizer, data loader etc.")
185
+ _, _, optimizer = train_util.get_optimizer(args, trainable_params=params_to_optimize)
186
+
187
+ # dataloaderを準備する
188
+ # DataLoaderのプロセス数:0はメインプロセスになる
189
+ n_workers = min(args.max_data_loader_n_workers, os.cpu_count() - 1) # cpu_count-1 ただし最大で指定された数まで
190
+ train_dataloader = torch.utils.data.DataLoader(
191
+ train_dataset_group,
192
+ batch_size=1,
193
+ shuffle=True,
194
+ collate_fn=collater,
195
+ num_workers=n_workers,
196
+ persistent_workers=args.persistent_data_loader_workers,
197
+ )
198
+
199
+ # 学習ステップ数を計算する
200
+ if args.max_train_epochs is not None:
201
+ args.max_train_steps = args.max_train_epochs * math.ceil(
202
+ len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps
203
+ )
204
+ print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}")
205
+
206
+ # データセット側にも学習ステップを送信
207
+ train_dataset_group.set_max_train_steps(args.max_train_steps)
208
+
209
+ # lr schedulerを用意する
210
+ lr_scheduler = train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes)
211
+
212
+ # 実験的機能:勾配も含めたfp16学習を行う モデル全体をfp16にする
213
+ if args.full_fp16:
214
+ assert (
215
+ args.mixed_precision == "fp16"
216
+ ), "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。"
217
+ print("enable full fp16 training.")
218
+ unet.to(weight_dtype)
219
+ text_encoder.to(weight_dtype)
220
+
221
+ # acceleratorがなんかよろしくやってくれるらしい
222
+ if args.train_text_encoder:
223
+ unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
224
+ unet, text_encoder, optimizer, train_dataloader, lr_scheduler
225
+ )
226
+ else:
227
+ unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler)
228
+
229
+ # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする
230
+ if args.full_fp16:
231
+ train_util.patch_accelerator_for_fp16_training(accelerator)
232
+
233
+ # resumeする
234
+ if args.resume is not None:
235
+ print(f"resume training from state: {args.resume}")
236
+ accelerator.load_state(args.resume)
237
+
238
+ # epoch数を計算する
239
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
240
+ num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
241
+ if (args.save_n_epoch_ratio is not None) and (args.save_n_epoch_ratio > 0):
242
+ args.save_every_n_epochs = math.floor(num_train_epochs / args.save_n_epoch_ratio) or 1
243
+
244
+ # 学習する
245
+ total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
246
+ print("running training / 学習開始")
247
+ print(f" num examples / サンプル数: {train_dataset_group.num_train_images}")
248
+ print(f" num batches per epoch / 1epochのバッチ数: {len(train_dataloader)}")
249
+ print(f" num epochs / epoch数: {num_train_epochs}")
250
+ print(f" batch size per device / バッチサイズ: {args.train_batch_size}")
251
+ print(f" total train batch size (with parallel & distributed & accumulation) / 総バッチサイズ(並列学習、勾配合計含む): {total_batch_size}")
252
+ print(f" gradient accumulation steps / 勾配を合計するステップ数 = {args.gradient_accumulation_steps}")
253
+ print(f" total optimization steps / 学習ステップ数: {args.max_train_steps}")
254
+
255
+ progress_bar = tqdm(range(args.max_train_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps")
256
+ global_step = 0
257
+
258
+ noise_scheduler = DDPMScheduler(
259
+ beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000, clip_sample=False
260
+ )
261
+
262
+ if accelerator.is_main_process:
263
+ accelerator.init_trackers("finetuning")
264
+
265
+ for epoch in range(num_train_epochs):
266
+ print(f"epoch {epoch+1}/{num_train_epochs}")
267
+ current_epoch.value = epoch + 1
268
+
269
+ for m in training_models:
270
+ m.train()
271
+
272
+ loss_total = 0
273
+ for step, batch in enumerate(train_dataloader):
274
+ current_step.value = global_step
275
+ with accelerator.accumulate(training_models[0]): # 複数モデルに対応していない模様だがとりあえずこうしておく
276
+ with torch.no_grad():
277
+ if "latents" in batch and batch["latents"] is not None:
278
+ latents = batch["latents"].to(accelerator.device)
279
+ else:
280
+ # latentに変換
281
+ latents = vae.encode(batch["images"].to(dtype=weight_dtype)).latent_dist.sample()
282
+ latents = latents * 0.18215
283
+ b_size = latents.shape[0]
284
+
285
+ with torch.set_grad_enabled(args.train_text_encoder):
286
+ # Get the text embedding for conditioning
287
+ input_ids = batch["input_ids"].to(accelerator.device)
288
+ encoder_hidden_states = train_util.get_hidden_states(
289
+ args, input_ids, tokenizer, text_encoder, None if not args.full_fp16 else weight_dtype
290
+ )
291
+
292
+ # Sample noise that we'll add to the latents
293
+ noise = torch.randn_like(latents, device=latents.device)
294
+ if args.noise_offset:
295
+ # https://www.crosslabs.org//blog/diffusion-with-offset-noise
296
+ noise += args.noise_offset * torch.randn((latents.shape[0], latents.shape[1], 1, 1), device=latents.device)
297
+
298
+ # Sample a random timestep for each image
299
+ timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (b_size,), device=latents.device)
300
+ timesteps = timesteps.long()
301
+
302
+ # Add noise to the latents according to the noise magnitude at each timestep
303
+ # (this is the forward diffusion process)
304
+ noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
305
+
306
+ # Predict the noise residual
307
+ noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
308
+
309
+ if args.v_parameterization:
310
+ # v-parameterization training
311
+ target = noise_scheduler.get_velocity(latents, noise, timesteps)
312
+ else:
313
+ target = noise
314
+
315
+ if args.min_snr_gamma:
316
+ # do not mean over batch dimension for snr weight
317
+ loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none")
318
+ loss = loss.mean([1, 2, 3])
319
+ loss = apply_snr_weight(loss, timesteps, noise_scheduler, args.min_snr_gamma)
320
+ loss = loss.mean() # mean over batch dimension
321
+ else:
322
+ loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="mean")
323
+
324
+ accelerator.backward(loss)
325
+ if accelerator.sync_gradients and args.max_grad_norm != 0.0:
326
+ params_to_clip = []
327
+ for m in training_models:
328
+ params_to_clip.extend(m.parameters())
329
+ accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
330
+
331
+ optimizer.step()
332
+ lr_scheduler.step()
333
+ optimizer.zero_grad(set_to_none=True)
334
+
335
+ # Checks if the accelerator has performed an optimization step behind the scenes
336
+ if accelerator.sync_gradients:
337
+ progress_bar.update(1)
338
+ global_step += 1
339
+
340
+ train_util.sample_images(
341
+ accelerator, args, None, global_step, accelerator.device, vae, tokenizer, text_encoder, unet
342
+ )
343
+
344
+ current_loss = loss.detach().item() # 平均なのでbatch sizeは関係ないはず
345
+ if args.logging_dir is not None:
346
+ logs = {"loss": current_loss, "lr": float(lr_scheduler.get_last_lr()[0])}
347
+ if args.optimizer_type.lower() == "DAdaptation".lower(): # tracking d*lr value
348
+ logs["lr/d*lr"] = (
349
+ lr_scheduler.optimizers[0].param_groups[0]["d"] * lr_scheduler.optimizers[0].param_groups[0]["lr"]
350
+ )
351
+ accelerator.log(logs, step=global_step)
352
+
353
+ # TODO moving averageにする
354
+ loss_total += current_loss
355
+ avr_loss = loss_total / (step + 1)
356
+ logs = {"loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]}
357
+ progress_bar.set_postfix(**logs)
358
+
359
+ if global_step >= args.max_train_steps:
360
+ break
361
+
362
+ if args.logging_dir is not None:
363
+ logs = {"loss/epoch": loss_total / len(train_dataloader)}
364
+ accelerator.log(logs, step=epoch + 1)
365
+
366
+ accelerator.wait_for_everyone()
367
+
368
+ if args.save_every_n_epochs is not None:
369
+ src_path = src_stable_diffusion_ckpt if save_stable_diffusion_format else src_diffusers_model_path
370
+ train_util.save_sd_model_on_epoch_end(
371
+ args,
372
+ accelerator,
373
+ src_path,
374
+ save_stable_diffusion_format,
375
+ use_safetensors,
376
+ save_dtype,
377
+ epoch,
378
+ num_train_epochs,
379
+ global_step,
380
+ unwrap_model(text_encoder),
381
+ unwrap_model(unet),
382
+ vae,
383
+ )
384
+
385
+ train_util.sample_images(accelerator, args, epoch + 1, global_step, accelerator.device, vae, tokenizer, text_encoder, unet)
386
+
387
+ is_main_process = accelerator.is_main_process
388
+ if is_main_process:
389
+ unet = unwrap_model(unet)
390
+ text_encoder = unwrap_model(text_encoder)
391
+
392
+ accelerator.end_training()
393
+
394
+ if args.save_state:
395
+ train_util.save_state_on_train_end(args, accelerator)
396
+
397
+ del accelerator # この後メモリを使うのでこれは消す
398
+
399
+ if is_main_process:
400
+ src_path = src_stable_diffusion_ckpt if save_stable_diffusion_format else src_diffusers_model_path
401
+ train_util.save_sd_model_on_train_end(
402
+ args, src_path, save_stable_diffusion_format, use_safetensors, save_dtype, epoch, global_step, text_encoder, unet, vae
403
+ )
404
+ print("model saved.")
405
+
406
+
407
+ def setup_parser() -> argparse.ArgumentParser:
408
+ parser = argparse.ArgumentParser()
409
+
410
+ train_util.add_sd_models_arguments(parser)
411
+ train_util.add_dataset_arguments(parser, False, True, True)
412
+ train_util.add_training_arguments(parser, False)
413
+ train_util.add_sd_saving_arguments(parser)
414
+ train_util.add_optimizer_arguments(parser)
415
+ config_util.add_config_arguments(parser)
416
+ custom_train_functions.add_custom_train_arguments(parser)
417
+
418
+ parser.add_argument("--diffusers_xformers", action="store_true", help="use xformers by diffusers / Diffusersでxformersを使用する")
419
+ parser.add_argument("--train_text_encoder", action="store_true", help="train text encoder / text encoderも学習する")
420
+
421
+ return parser
422
+
423
+
424
+ if __name__ == "__main__":
425
+ parser = setup_parser()
426
+
427
+ args = parser.parse_args()
428
+ args = train_util.read_config_from_file(args, parser)
429
+
430
+ train(args)
fine_tune_README.md ADDED
@@ -0,0 +1,465 @@
1
+ This is a fine-tuning implementation that supports NovelAI's proposed training method, automatic captioning, automatic tagging, a Windows + 12GB VRAM environment (for SD v1.4/1.5), and more.
2
+
3
+ ## Overview
4
+ Fine tuning of Stable Diffusion's U-Net using Diffusers. It supports the following improvements from NovelAI's article (for Aspect Ratio Bucketing I referred to NovelAI's code, but the final code is entirely original).
5
+
6
+ * Use the output of the penultimate layer instead of the last layer of CLIP (Text Encoder).
7
+ * Learning at non-square resolutions (Aspect Ratio Bucketing).
8
+ * Extend token length from 75 to 225.
9
+ * Captioning with BLIP (automatic creation of captions), automatic tagging with DeepDanbooru or WD14Tagger.
10
+ * Also supports Hypernetwork learning.
11
+ * Supports Stable Diffusion v2.0 (base and 768/v).
12
+ * By acquiring the output of VAE in advance and saving it to disk, we aim to save memory and speed up learning.
13
+
14
+ The Text Encoder is not trained by default. When fine tuning the whole model, it seems common to train only the U-Net (NovelAI appears to do the same). The Text Encoder can also be trained as an option.
15
+
16
+ ## Additional features
17
+ ### Change CLIP output
18
+ CLIP (Text Encoder) converts the text into features in order to reflect the prompt in the image. Stable diffusion uses the output of the last layer of CLIP, but you can change it to use the output of the penultimate layer. According to NovelAI, this will reflect prompts more accurately.
19
+ It is also possible to use the output of the last layer as is.
20
+ *Stable Diffusion 2.0 uses the penultimate layer by default. Do not specify the clip_skip option.
21
+
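
As a rough illustration of the idea, this is how the penultimate-layer output can be obtained with Hugging Face transformers. This is a minimal sketch for illustration only; the repository's own handling (train_util.get_hidden_states) may differ in detail, and the model ID is just an example.

```python
# Minimal sketch: using the penultimate CLIP text-encoder layer (clip_skip=2).
import torch
from transformers import CLIPTokenizer, CLIPTextModel

model_id = "openai/clip-vit-large-patch14"  # example; the SD v1.x text encoder
tokenizer = CLIPTokenizer.from_pretrained(model_id)
text_encoder = CLIPTextModel.from_pretrained(model_id)

tokens = tokenizer("a photo of a cat", padding="max_length", max_length=77, return_tensors="pt")
with torch.no_grad():
    out = text_encoder(tokens.input_ids, output_hidden_states=True)

last_layer = out.last_hidden_state        # default behaviour: final layer output
penultimate = out.hidden_states[-2]       # penultimate layer output
# The penultimate output is typically passed through the final layer norm before use.
penultimate = text_encoder.text_model.final_layer_norm(penultimate)
```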
22
+ ### Training in non-square resolutions
23
+ Stable Diffusion is trained at 512\*512, but here training is also performed at resolutions such as 256\*1024 and 384\*640. This is expected to reduce the amount of cropping and to learn the relationship between prompts and images more correctly.
24
+ The learning resolution is adjusted vertically and horizontally in units of 64 pixels within a range that does not exceed the resolution area (= memory usage) given as a parameter.
25
+
26
+ In machine learning it is common to unify all input sizes, but there is no strict requirement: it is enough that sizes are unified within each batch. NovelAI's bucketing appears to mean classifying the training data into buckets by training resolution (according to aspect ratio) in advance; by building each batch only from images in the same bucket, the image size within a batch is unified. See the sketch below for the idea.
27
+
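
As a rough sketch of that idea (not the repository's actual bucketing code, which also handles minimum/maximum sizes and the assignment of images to buckets), the candidate resolutions can be enumerated like this:

```python
# Enumerate candidate bucket resolutions in 64-pixel steps whose pixel area
# does not exceed the area of the base training resolution (e.g. 512x512).
def make_buckets(base=512, step=64, min_size=256, max_size=1024):
    max_area = base * base
    buckets = set()
    width = min_size
    while width <= max_size:
        height = min(max_size, (max_area // width) // step * step)
        if height >= min_size:
            buckets.add((width, height))
            buckets.add((height, width))  # also add the rotated aspect ratio
        width += step
    return sorted(buckets)

print(make_buckets())  # includes (256, 1024), (320, 768), (384, 640), (512, 512), ...
```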
28
+ ### Extending token length from 75 to 225
29
+ Stable Diffusion accepts a maximum of 75 tokens (77 including the start and end tokens), but here this is extended to 225 tokens.
30
+ However, the maximum length that CLIP accepts is 75 tokens, so in the case of 225 tokens, we simply divide it into thirds, call CLIP, and then concatenate the results.
31
+
32
+ *I'm not sure if this is the preferred implementation. It seems to be working for now. Especially in 2.0, there is no implementation that can be used as a reference, so I have implemented it independently.
33
+
34
+ *Automatic1111's Web UI seems to divide the text with commas in mind, but in my case, it's a simple division.
35
+
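
Conceptually, the long prompt is wrapped into several 77-token CLIP calls (each with its own start/end tokens) and the resulting hidden states are concatenated along the sequence axis. The sketch below is a simplified illustration of that idea only; the repository's real handling (in library/train_util.py) also deals with padding, clip_skip, and weighting.

```python
# Sketch: encode a long prompt by splitting it into 75-token chunks,
# running CLIP on each chunk within its 77-token limit, and concatenating.
import torch

def encode_long_prompt(tokenizer, text_encoder, prompt, chunk_len=75):
    ids = tokenizer(prompt, add_special_tokens=False).input_ids
    bos, eos = tokenizer.bos_token_id, tokenizer.eos_token_id
    chunks = [ids[i:i + chunk_len] for i in range(0, max(len(ids), 1), chunk_len)]
    states = []
    for chunk in chunks:
        chunk = [bos] + chunk + [eos] * (chunk_len + 1 - len(chunk))  # pad to 77 tokens
        with torch.no_grad():
            states.append(text_encoder(torch.tensor([chunk])).last_hidden_state)
    return torch.cat(states, dim=1)  # shape: (1, n_chunks * 77, hidden_dim)
```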
36
+ ## Environment setup
37
+
38
+ See the [README](./README-en.md) in this repository.
39
+
40
+ ## Preparing teacher data
41
+
42
+ Prepare the image data you want to learn and put it in any folder. No prior preparation such as resizing is required.
43
+ However, for images that are smaller than the training resolution, it is recommended to enlarge them while maintaining the quality using super-resolution.
44
+
45
+ It also supports multiple teacher data folders. Preprocessing will be executed for each folder.
46
+
47
+ For example, store an image like this:
48
+
49
+ ![Teacher data folder screenshot](https://user-images.githubusercontent.com/52813779/208907739-8e89d5fa-6ca8-4b60-8927-f484d2a9ae04.png)
50
+
51
+ ## Automatic captioning
52
+ Skip if you just want to learn tags without captions.
53
+
54
+ Also, when preparing captions manually, place them in the same directory as the teacher data images, with the same file name and an extension such as .caption. Each file should be a text file containing a single line.
55
+
56
+ ### Captioning with BLIP
57
+
58
+ The latest version no longer requires BLIP downloads, weight downloads, and additional virtual environments. Works as-is.
59
+
60
+ Run make_captions.py in the finetune folder.
61
+
62
+ ```
63
+ python finetune\make_captions.py --batch_size <batch size> <teacher data folder>
64
+ ```
65
+
66
+ If the batch size is 8 and the training data is placed in the parent folder train_data, it will be as follows.
67
+
68
+ ```
69
+ python finetune\make_captions.py --batch_size 8 ..\train_data
70
+ ```
71
+
72
+ A caption file is created in the same directory as the teacher data image with the same file name and extension .caption.
73
+
74
+ Increase or decrease batch_size according to the GPU's VRAM capacity. Bigger is faster (with 12GB of VRAM it can probably be raised a little further).
75
+ You can specify the maximum length of the caption with the max_length option. Default is 75. It may be longer if the model is trained with a token length of 225.
76
+ You can change the caption extension with the caption_extension option. Default is .caption (.txt conflicts with DeepDanbooru described later).
77
+
78
+ If there are multiple teacher data folders, execute for each folder.
79
+
80
+ Note that the inference is random, so the results will change each time you run it. If you want to fix it, specify a random number seed like "--seed 42" with the --seed option.
81
+
82
+ For other options, please refer to the help with --help (there seems to be no documentation for the meaning of the parameters, so you have to look at the source).
83
+
84
+ A caption file is generated with the extension .caption by default.
85
+
86
+ ![Folder where caption is generated](https://user-images.githubusercontent.com/52813779/208908845-48a9d36c-f6ee-4dae-af71-9ab462d1459e.png)
87
+
88
+ For example, with captions like:
89
+
90
+ ![captions and images](https://user-images.githubusercontent.com/52813779/208908947-af936957-5d73-4339-b6c8-945a52857373.png)
91
+
92
+ ## Tagging with DeepDanbooru
93
+ If you do not want to use danbooru tags themselves, please skip ahead to "Preprocessing caption and tag information".
94
+
95
+ Tagging is done with DeepDanbooru or WD14Tagger. WD14Tagger seems to be more accurate. If you want to tag with WD14Tagger, skip to the next chapter.
96
+
97
+ ### Environment setup
98
+ Clone DeepDanbooru https://github.com/KichangKim/DeepDanbooru into your working folder, or download the zip and extract it. I unzipped it.
99
+ Also, download deepdanbooru-v3-20211112-sgd-e28.zip from Assets of "DeepDanbooru Pretrained Model v3-20211112-sgd-e28" on the DeepDanbooru Releases page https://github.com/KichangKim/DeepDanbooru/releases and extract it to the DeepDanbooru folder.
100
+
101
+ Download from below. Click to open Assets and download from there.
102
+
103
+ ![DeepDanbooru download page](https://user-images.githubusercontent.com/52813779/208909417-10e597df-7085-41ee-bd06-3e856a1339df.png)
104
+
105
+ Make a directory structure like this
106
+
107
+ ![DeepDanbooru directory structure](https://user-images.githubusercontent.com/52813779/208909486-38935d8b-8dc6-43f1-84d3-fef99bc471aa.png)
108
+
109
+ Install the necessary libraries for the Diffusers environment. Go to the DeepDanbooru folder and install it (I think it's actually just adding tensorflow-io).
110
+
111
+ ```
112
+ pip install -r requirements.txt
113
+ ```
114
+
115
+ Next, install DeepDanbooru itself.
116
+
117
+ ```
118
+ pip install .
119
+ ```
120
+
121
+ This completes the preparation of the environment for tagging.
122
+
123
+ ### Performing tagging
124
+ Go to DeepDanbooru's folder and run deepdanbooru to tag.
125
+
126
+ ```
127
+ deepdanbooru evaluate <teacher data folder> --project-path deepdanbooru-v3-20211112-sgd-e28 --allow-folder --save-txt
128
+ ```
129
+
130
+ If you put the training data in the parent folder train_data, it will be as follows.
131
+
132
+ ```
133
+ deepdanbooru evaluate ../train_data --project-path deepdanbooru-v3-20211112-sgd-e28 --allow-folder --save-txt
134
+ ```
135
+
136
+ A tag file is created in the same directory as the teacher data image with the same file name and extension .txt. It is slow because it is processed one by one.
137
+
138
+ If there are multiple teacher data folders, execute for each folder.
139
+
140
+ It is generated as follows.
141
+
142
+ ![DeepDanbooru generated files](https://user-images.githubusercontent.com/52813779/208909855-d21b9c98-f2d3-4283-8238-5b0e5aad6691.png)
143
+
144
+ A tag is attached like this (great amount of information...).
145
+
146
+ ![Deep Danbooru tag and image](https://user-images.githubusercontent.com/52813779/208909908-a7920174-266e-48d5-aaef-940aba709519.png)
147
+
148
+ ## Tagging with WD14Tagger
149
+ This procedure uses WD14Tagger instead of DeepDanbooru.
150
+
151
+ Use the tagger used in Automatic1111's WebUI. I referred to the information on this GitHub page (https://github.com/toriato/stable-diffusion-webui-wd14-tagger#mrsmilingwolfs-model-aka-waifu-diffusion-14-tagger).
152
+
153
+ The required modules were already installed during the initial environment setup. Weights are downloaded automatically from Hugging Face.
154
+
155
+ ### Performing tagging
156
+ Run the script to do the tagging.
157
+ ```
158
+ python tag_images_by_wd14_tagger.py --batch_size <batch size> <teacher data folder>
159
+ ```
160
+
161
+ If you put the training data in the parent folder train_data, it will be as follows.
162
+ ```
163
+ python tag_images_by_wd14_tagger.py --batch_size 4 ..\train_data
164
+ ```
165
+
166
+ The model file will be automatically downloaded to the wd14_tagger_model folder on first launch (folder can be changed in options). It will be as follows.
167
+
168
+ ![downloaded file](https://user-images.githubusercontent.com/52813779/208910447-f7eb0582-90d6-49d3-a666-2b508c7d1842.png)
169
+
170
+ A tag file is created in the same directory as the teacher data image with the same file name and extension .txt.
171
+
172
+ ![generated tag file](https://user-images.githubusercontent.com/52813779/208910534-ea514373-1185-4b7d-9ae3-61eb50bc294e.png)
173
+
174
+ ![tags and images](https://user-images.githubusercontent.com/52813779/208910599-29070c15-7639-474f-b3e4-06bd5a3df29e.png)
175
+
176
+ With the thresh option, you can specify the confidence threshold a tag must reach to be attached. The default is 0.35, the same as in the WD14Tagger sample. Lower values attach more tags but with lower accuracy.
177
+ Increase or decrease batch_size according to the GPU's VRAM capacity. Bigger is faster (with 12GB of VRAM it can probably be raised a little further). You can change the tag file extension with the caption_extension option. The default is .txt.
178
+ You can specify the folder where the model is saved with the model_dir option.
179
+ Also, if you specify the force_download option, the model will be re-downloaded even if there is a save destination folder.
180
+
181
+ If there are multiple teacher data folders, execute for each folder.
182
+
183
+ ## Preprocessing caption and tag information
184
+
185
+ Combine captions and tags into a single file as metadata for easy processing from scripts.
186
+
187
+ ### Caption preprocessing
188
+
189
+ To put captions into the metadata, run the following in your working folder (if you don't use captions for training, you don't need to run this). The command is actually entered on a single line; the same applies to the commands below.
190
+
191
+ ```
192
+ python merge_captions_to_metadata.py <teacher data folder>
193
+ --in_json <metadata file name to read>
194
+ <metadata file name>
195
+ ```
196
+
197
+ The metadata file name is an arbitrary name.
198
+ If the training data is train_data, there is no metadata file to read, and the metadata file is meta_cap.json, it will be as follows.
199
+
200
+ ```
201
+ python merge_captions_to_metadata.py train_data meta_cap.json
202
+ ```
203
+
204
+ You can specify the caption extension with the caption_extension option.
205
+
206
+ If there are multiple teacher data folders, please specify the full_path argument (metadata will have full path information). Then run it for each folder.
207
+
208
+ ```
209
+ python merge_captions_to_metadata.py --full_path
210
+ train_data1 meta_cap1.json
211
+ python merge_captions_to_metadata.py --full_path --in_json meta_cap1.json
212
+ train_data2 meta_cap2.json
213
+ ```
214
+
215
+ If in_json is omitted and the destination metadata file already exists, it will be read from there and overwritten in place.
216
+
217
+ __*It is safer to change the in_json option and the write destination each time, writing to a separate metadata file.__
218
+
219
+ ### Tag preprocessing
220
+
221
+ Similarly, tags are also collected in metadata (no need to do this if tags are not used for learning).
222
+ ```
223
+ python merge_dd_tags_to_metadata.py <teacher data folder>
224
+ --in_json <metadata file name to load>
225
+ <metadata file name to write>
226
+ ```
227
+
228
+ With the same directory structure as above, when reading meta_cap.json and writing to meta_cap_dd.json, it will be as follows.
229
+ ```
230
+ python merge_dd_tags_to_metadata.py train_data --in_json meta_cap.json meta_cap_dd.json
231
+ ```
232
+
233
+ If you have multiple teacher data folders, please specify the full_path argument. Then run it for each folder.
234
+
235
+ ```
236
+ python merge_dd_tags_to_metadata.py --full_path --in_json meta_cap2.json
237
+ train_data1 meta_cap_dd1.json
238
+ python merge_dd_tags_to_metadata.py --full_path --in_json meta_cap_dd1.json
239
+ train_data2 meta_cap_dd2.json
240
+ ```
241
+
242
+ If in_json is omitted and the destination metadata file already exists, it will be read from there and overwritten in place.
243
+
244
+ __*It is safer to change the in_json option and the write destination each time, writing to a separate metadata file.__
245
+
246
+ ### Cleaning captions and tags
247
+ Up to this point, captions and DeepDanbooru tags have been collected into the metadata file. However, automatically generated captions suffer from spelling variations (*), and tags may contain underscores and ratings (in the case of DeepDanbooru), so you should clean the captions and tags using your editor's search-and-replace function or similar.
248
+
249
+ *For example, when learning a girl in an anime picture, there are variations in captions such as girl/girls/woman/women. Also, it may be more appropriate to simply use "girl" for things like "anime girl".
250
+
251
+ A script for cleaning is provided, so please edit the contents of the script according to the situation and use it.
252
+
253
+ (It is no longer necessary to specify the teacher data folder. All data in the metadata will be cleaned.)
254
+
255
+ ```
256
+ python clean_captions_and_tags.py <metadata file name to read> <metadata file name to write>
257
+ ```
258
+
259
+ Please note that --in_json is not included. For example:
260
+
261
+ ```
262
+ python clean_captions_and_tags.py meta_cap_dd.json meta_clean.json
263
+ ```
264
+
265
+ Preprocessing of captions and tags is now complete.
266
+
267
+ ## Get latents in advance
268
+
269
+ In order to speed up the learning, we acquire the latent representation of the image in advance and save it to disk. At the same time, bucketing (classifying the training data according to the aspect ratio) is performed.
270
+
271
+ In your working folder, type:
272
+ ```
273
+ python prepare_buckets_latents.py <teacher data folder>
274
+ <metadata file name to read> <metadata file name to write>
275
+ <model name or checkpoint for fine tuning>
276
+ --batch_size <batch size>
277
+ --max_resolution <resolution width, height>
278
+ --mixed_precision <precision>
279
+ ```
280
+
281
+ If the model is model.ckpt, batch size 4, training resolution is 512\*512, precision is no (float32), read metadata from meta_clean.json and write to meta_lat.json:
282
+
283
+ ```
284
+ python prepare_buckets_latents.py
285
+ train_data meta_clean.json meta_lat.json model.ckpt
286
+ --batch_size 4 --max_resolution 512,512 --mixed_precision no
287
+ ```
288
+
289
+ Latents are saved in numpy npz format in the teacher data folder.
290
+
291
+ Specify the --v2 option when loading a Stable Diffusion 2.0 model (--v_parameterization is not required).
292
+
293
+ You can specify the minimum resolution size with the --min_bucket_reso option and the maximum size with the --max_bucket_reso option. The defaults are 256 and 1024 respectively. For example, specifying a minimum size of 384 will not use resolutions such as 256\*1024 or 320\*768.
294
+ If you increase the resolution to something like 768\*768, you should specify something like 1280 for the maximum size.
295
+
296
+ If you specify the --flip_aug option, it will perform horizontal flip augmentation (data augmentation). You can artificially double the amount of data, but if you specify it when the data is not left-right symmetrical (for example, character appearance, hairstyle, etc.), learning will not go well.
297
+ (This is a simple implementation that also computes latents for the flipped image and saves them as \*\_flip.npz files. No extra options are needed for fine_tune.py; if a file with \_flip exists, the files with and without \_flip are loaded at random.)
298
+
299
+ The batch size may be increased a little more even with 12GB of VRAM.
300
+ The resolution must be a number divisible by 64 and is specified as "width,height". The resolution is directly linked to memory usage during fine tuning. 512,512 seems to be the limit with 12GB of VRAM (*). With 16GB it may be possible to raise it to 512,704 or 512,768. Even at 256,256 and so on, 8GB of VRAM seems difficult (because the parameters and the optimizer require a certain amount of memory regardless of resolution).
301
+
302
+ *There was also a report that learning batch size 1 worked with 12GB VRAM and 640,640.
303
+
304
+ The result of bucketing is displayed as follows.
305
+
306
+ ![bucketing result](https://user-images.githubusercontent.com/52813779/208911419-71c00fbb-2ce6-49d5-89b5-b78d7715e441.png)
307
+
308
+ If you have multiple teacher data folders, please specify the full_path argument. Then run it for each folder.
309
+ ```
310
+ python prepare_buckets_latents.py --full_path
311
+ train_data1 meta_clean.json meta_lat1.json model.ckpt
312
+ --batch_size 4 --max_resolution 512,512 --mixed_precision no
313
+
314
+ python prepare_buckets_latents.py --full_path
315
+ train_data2 meta_lat1.json meta_lat2.json model.ckpt
316
+ --batch_size 4 --max_resolution 512,512 --mixed_precision no
317
+
318
+ ```
319
+ It is possible to make the read source and write destination the same, but separate is safer.
320
+
321
+ __*It is safer to change the arguments each time and write to a separate metadata file.__
322
+
323
+
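
What gets written next to each image is essentially the VAE-encoded (and 0.18215-scaled) latent of the bucket-resized image, stored as an .npz file. Below is a rough sketch of that idea only, assuming a Diffusers AutoencoderKL, an already-resized RGB image, and example model ID/paths; the actual file layout and key names are defined by prepare_buckets_latents.py and may differ.

```python
# Sketch: encode one (already bucket-resized) image to a latent and save it as .npz.
import numpy as np
import torch
from diffusers import AutoencoderKL
from PIL import Image

vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")  # example ID
vae.eval()

image = Image.open("train_data/example.png").convert("RGB")   # assumed path; H and W multiples of 64
x = torch.from_numpy(np.array(image)).float() / 127.5 - 1.0   # scale pixels to [-1, 1]
x = x.permute(2, 0, 1).unsqueeze(0)                           # (1, 3, H, W)

with torch.no_grad():
    latent = vae.encode(x).latent_dist.sample() * 0.18215     # same scaling used at training time

np.savez("train_data/example.npz", latents=latent[0].numpy())  # key name illustrative
```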
324
+ ## Run training
325
+ For example: Below are the settings for saving memory.
326
+ ```
327
+ accelerate launch --num_cpu_threads_per_process 8 fine_tune.py
328
+ --pretrained_model_name_or_path=model.ckpt
329
+ --in_json meta_lat.json
330
+ --train_data_dir=train_data
331
+ --output_dir=fine_tuned
332
+ --shuffle_caption
333
+ --train_batch_size=1 --learning_rate=5e-6 --max_train_steps=10000
334
+ --use_8bit_adam --xformers --gradient_checkpointing
335
+ --mixed_precision=bf16
336
+ --save_every_n_epochs=4
337
+ ```
338
+
339
+ It seems to be good to specify the number of CPU cores for num_cpu_threads_per_process of accelerate.
340
+
341
+ Specify the model to be trained in pretrained_model_name_or_path (Stable Diffusion checkpoint or Diffusers model). Stable Diffusion checkpoint supports .ckpt and .safetensors (automatically determined by extension).
342
+
343
+ Specify the metadata file produced when caching latents with in_json.
344
+
345
+ Specify the training data folder for train_data_dir and the output destination folder for the trained model for output_dir.
346
+
347
+ If shuffle_caption is specified, captions and tags are shuffled and learned in units separated by commas (this is the method used in Waifu Diffusion v1.3).
348
+ (You can keep some of the leading tokens fixed without shuffling. See keep_tokens for other options.)
349
+
350
+ Specify the batch size in train_batch_size. Specify 1 or 2 for VRAM 12GB. The number that can be specified also changes depending on the resolution.
351
+ The actual amount of data used for training is "batch size x number of steps". When increasing the batch size, the number of steps can be decreased accordingly.
352
+
353
+ Specify the learning rate in learning_rate. For example Waifu Diffusion v1.3 seems to be 5e-6.
354
+ Specify the number of steps in max_train_steps.
355
+
356
+ Specify use_8bit_adam to use the 8-bit Adam Optimizer. It saves memory and speeds up, but accuracy may decrease.
357
+
358
+ Specifying xformers replaces CrossAttention to save memory and speed up.
359
+ * As of 11/9, xformers will cause an error in float32 learning, so please use bf16/fp16 or use memory-saving CrossAttention with mem_eff_attn instead (speed is inferior to xformers).
360
+
361
+ Specify gradient_checkpointing to enable gradient checkpointing (intermediate activations are recomputed instead of stored). It is slower but uses less memory.
362
+
363
+ Specifies whether to use mixed precision with mixed_precision. Specifying "fp16" or "bf16" saves memory, but accuracy is inferior.
364
+ "fp16" and "bf16" use almost the same amount of memory, and it is said that bf16 has better learning results (I didn't feel much difference in the range I tried).
365
+ If "no" is specified, it will not be used (it will be float32).
366
+
367
+ * It seems that an error occurs when loading checkpoints trained with bf16 in AUTOMATIC1111's Web UI. This appears to be because the bfloat16 data type causes an error in the Web UI's model safety checker. Save in fp16 or float32 format with the save_precision option, or save in safetensors format.
368
+
369
+ Specifying save_every_n_epochs will save the model being trained every time that many epochs have passed.
370
+
371
+ ### Supports Stable Diffusion 2.0
372
+ Specify the --v2 option when using Hugging Face's stable-diffusion-2-base, and specify both the --v2 and --v_parameterization options when using stable-diffusion-2 or 768-v-ema.ckpt.
373
+
374
+ ### Increase accuracy and speed when memory is available
375
+ First, removing gradient_checkpointing will speed things up. However, the batch size that can be set becomes smaller, so adjust it while weighing accuracy against speed.
376
+
377
+ Increasing the batch size increases speed and accuracy. Increase it while checking the speed per sample within the range where memory is sufficient (speed may actually drop when memory is at its limit).
378
+
379
+ ### Change CLIP output used
380
+ Specifying 2 for the clip_skip option uses the output of the next-to-last layer. If 1 is specified or the option is omitted, the last layer is used.
381
+ The learned model should be able to be inferred by Automatic1111's web UI.
382
+
383
+ *SD2.0 uses the second layer from the back by default, so please do not specify it when learning SD2.0.
384
+
385
+ If the model being trained was originally trained to use the second layer, 2 is a good value.
386
+
387
+ If you were using the last layer instead, the entire model would have been trained on that assumption. Therefore, if you train again using the second layer, you may need a certain number of teacher data and longer learning to obtain the desired learning result.
388
+
389
+ ### Extending Token Length
390
+ You can train with an extended token length by specifying 150 or 225 for max_token_length.
391
+ The learned model should be able to be inferred by Automatic1111's web UI.
392
+
393
+ As with clip_skip, learning with a length different from the learning state of the model may require a certain amount of teacher data and a longer learning time.
394
+
395
+ ### Save learning log
396
+ Specify the log save destination folder in the logging_dir option. Logs in TensorBoard format are saved.
397
+
398
+ For example, if you specify --logging_dir=logs, a logs folder will be created in your working folder, and logs will be saved in the date/time folder.
399
+ Also, if you specify the --log_prefix option, the specified string will be added before the date and time. Use "--logging_dir=logs --log_prefix=fine_tune_style1" for identification.
400
+
401
+ To check the logs with TensorBoard, open another command prompt, move to the working folder, and enter the following (TensorBoard should have been installed together with Diffusers; if it is not installed, add it with "pip install tensorboard").
402
+ ```
403
+ tensorboard --logdir=logs
404
+ ```
405
+
406
+ ### Learning Hypernetworks
407
+ It will be explained in another article.
408
+
409
+ ### Learning with fp16 gradient (experimental feature)
410
+ The full_fp16 option will change the gradient from normal float32 to float16 (fp16) and learn (it seems to be full fp16 learning instead of mixed precision). As a result, it seems that the SD1.x 512*512 size can be learned with a VRAM usage of less than 8GB, and the SD2.x 512*512 size can be learned with a VRAM usage of less than 12GB.
411
+
412
+ Specify fp16 in advance in accelerate config and optionally set mixed_precision="fp16" (does not work with bf16).
413
+
414
+ To minimize memory usage, use the xformers, use_8bit_adam, gradient_checkpointing options and set train_batch_size to 1.
415
+ (If you can afford it, increasing the train_batch_size step by step should improve the accuracy a little.)
416
+
417
+ This is achieved by patching the PyTorch source (confirmed with PyTorch 1.12.1 and 1.13.0). Accuracy drops considerably, and the probability that training fails partway through also increases. The learning rate and step count settings seem to be sensitive. Be aware of these issues and use it at your own risk.
418
+
419
+ ### Other Options
420
+
421
+ #### keep_tokens
422
+ If a number is specified, the specified number of tokens (comma-separated strings) from the beginning of the caption are fixed without being shuffled.
423
+
424
+ If there are both captions and tags, the prompts during training are concatenated like "caption, tag 1, tag 2 ...", so if you set "--keep_tokens=1", the caption will always come first during training.
425
+
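
A tiny sketch of what keep_tokens means for caption shuffling, assuming a comma-separated caption (illustrative only):

```python
# Sketch: shuffle a comma-separated caption while keeping the first
# `keep_tokens` chunks fixed at the start.
import random

def shuffle_caption(caption: str, keep_tokens: int = 1) -> str:
    parts = [p.strip() for p in caption.split(",")]
    fixed, rest = parts[:keep_tokens], parts[keep_tokens:]
    random.shuffle(rest)
    return ", ".join(fixed + rest)

print(shuffle_caption("a photo of a girl, long hair, smile, outdoors", keep_tokens=1))
# The caption stays first; the remaining tags appear in random order.
```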
426
+ #### dataset_repeats
427
+ If the dataset is extremely small, each epoch ends very quickly (and epoch boundaries take some time), so specify a number to repeat the data that many times and make each epoch longer.
428
+
429
+ #### train_text_encoder
430
+ The Text Encoder is also trained. Memory usage increases slightly.
431
+
432
+ In normal fine tuning, the Text Encoder is not trained (probably because the U-Net is trained to follow the Text Encoder's output), but when the amount of training data is small, training the Text Encoder as in DreamBooth also appears to be effective.
433
+
434
+ #### save_precision
435
+ The data format used when saving checkpoints can be chosen from float, fp16, and bf16 (if not specified, the same format as during training is used). fp16 and bf16 save disk space, but the model will produce slightly different results. If you specify float or fp16, the checkpoint should be loadable in Automatic1111's Web UI.
436
+
437
+ *The VAE keeps the data format of the original checkpoint, so the model size may not shrink to just over 2GB even with fp16.
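+
+ For example, to save checkpoints as fp16 safetensors (a sketch; save_model_as is described in the next section):
+
+ ```
+ accelerate launch --num_cpu_threads_per_process 1 fine_tune.py
+ <your usual options>
+ --save_precision=fp16 --save_model_as=safetensors
+ ```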
438
+
439
+ #### save_model_as
440
+ Specify the save format of the model. Specify one of ckpt, safetensors, diffusers, diffusers_safetensors.
441
+
442
+ When a Stable Diffusion format model (ckpt or safetensors) is loaded and saved in Diffusers format, missing information is filled in by downloading the v1.5 or v2.1 information from Hugging Face.
443
+
444
+ #### use_safetensors
445
+ This option saves checkpoints in safetensors format when the save format is left at its default (the same format as the loaded model).
446
+
447
+ #### save_state and resume
448
+ The save_state option saves the training state (optimizer state, etc.) to a folder alongside the checkpoint, both at intermediate saves and at the final save. This avoids a drop in accuracy when training is resumed after an interruption (since the optimizer keeps internal state, resetting that state means optimization has to start over from the initial state). Note that the step count is not saved due to Accelerate's specifications.
449
+
450
+ When starting the script, you can resume by specifying the folder where the state is saved with the resume option.
451
+
452
+ Note that each saved state is roughly 5GB, so watch your disk capacity.
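+
+ For example, a sketch of saving the state on the first run and resuming later (the state folder name is a placeholder; use the folder actually created by the first run):
+
+ ```
+ # first run: also save the training state
+ accelerate launch --num_cpu_threads_per_process 1 fine_tune.py <your usual options> --save_state
+
+ # later run: resume from the saved state folder
+ accelerate launch --num_cpu_threads_per_process 1 fine_tune.py <your usual options> --resume=<saved state folder>
+ ```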
453
+
454
+ #### gradient_accumulation_steps
455
+ Accumulates gradients over the specified number of steps before updating. This has an effect similar to increasing the batch size, but consumes slightly more memory.
456
+
457
+ *Accelerate does not support accumulating gradients for multiple models, so if the Text Encoder is made a training target and this option is set to 2 or more, an error may occur.
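+
+ For example, to get an effective batch size of about 4 while keeping per-step memory low (the values are illustrative):
+
+ ```
+ accelerate launch --num_cpu_threads_per_process 1 fine_tune.py
+ <your usual options>
+ --train_batch_size=1 --gradient_accumulation_steps=4
+ ```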
458
+
459
+ #### lr_scheduler / lr_warmup_steps
460
+ With the lr_scheduler option you can choose the learning rate scheduler from linear, cosine, cosine_with_restarts, polynomial, constant, and constant_with_warmup. The default is constant.
461
+
462
+ With lr_warmup_steps, you can specify the number of steps to warm up the scheduler (gradually changing the learning rate). Please do your own research for details.
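+
+ For example, a cosine schedule with a short warmup (the step count is only illustrative):
+
+ ```
+ accelerate launch --num_cpu_threads_per_process 1 fine_tune.py
+ <your usual options>
+ --lr_scheduler=cosine --lr_warmup_steps=500
+ ```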
463
+
464
+ #### diffusers_xformers
465
+ Uses Diffusers' xformers feature rather than the script's own xformers replacement feature. Hypernetwork learning is no longer possible.
fine_tune_README_ja.md ADDED
@@ -0,0 +1,140 @@
+ This is fine tuning that supports the training method proposed by NovelAI, automatic captioning, automatic tagging, a Windows + 12GB VRAM environment (for SD v1.x), and so on. Here, fine tuning means training a model on images and captions (LoRA, Textual Inversion, and Hypernetworks are not included).
+
+ Please also see the [common documentation on training](./train_README-ja.md).
+
+ # Overview
+
+ Fine tuning of Stable Diffusion's U-Net is performed using Diffusers. It supports the following improvements from NovelAI's article (for Aspect Ratio Bucketing, NovelAI's code was used as a reference, but the final code is entirely original).
+
+ * Use the output of the second-to-last layer of CLIP (Text Encoder) instead of the last layer.
+ * Training at non-square resolutions (Aspect Ratio Bucketing).
+ * Extend the token length from 75 to 225.
+ * Captioning with BLIP (automatic caption creation) and automatic tagging with DeepDanbooru or WD14Tagger.
+ * Hypernetwork training is also supported.
+ * Supports Stable Diffusion v2.0 (base and 768/v).
+ * Obtain the VAE output in advance and save it to disk to reduce memory usage and speed up training.
+
+ By default, the Text Encoder is not trained. When fine tuning the whole model, it seems common to train only the U-Net (NovelAI appears to do the same). The Text Encoder can be made a training target by specifying an option.
+
+ # Additional features
+
+ ## Changing the CLIP output
+
+ CLIP (Text Encoder) converts the text into features so that the prompt is reflected in the image. Stable Diffusion uses the output of CLIP's last layer, but this can be changed to use the output of the second-to-last layer. According to NovelAI, this makes the prompt be reflected more accurately.
+ It is also possible to keep using the output of the last layer, as in the original.
+
+ *Stable Diffusion 2.0 uses the second-to-last layer by default. Do not specify the clip_skip option.
+
+ ## Training at non-square resolutions
+
+ Stable Diffusion was trained at 512\*512, but in addition this trains at resolutions such as 256\*1024 and 384\*640. This reduces the amount of cropping, and the relationship between prompts and images is expected to be learned more correctly.
+ The training resolutions are created by adjusting the width and height in 64-pixel steps, within a range that does not exceed the area (= memory usage) of the resolution given as a parameter.
+
+ In machine learning it is common to unify all input sizes, but there is no particular constraint requiring this; in practice it is enough for sizes to be unified within a single batch. The bucketing that NovelAI refers to seems to mean classifying the training data in advance into training resolutions according to aspect ratio. Batches are then created from the images within each bucket, which unifies the image sizes in the batch.
+
+ ## Extending the token length from 75 to 225
+
+ Stable Diffusion allows a maximum of 75 tokens (77 tokens including the start and end tokens), but this extends it to 225 tokens.
+ However, since the maximum length CLIP accepts is 75 tokens, with 225 tokens the input is simply split into three parts, CLIP is called for each, and the results are concatenated.
+
+ *It is not entirely clear whether this is the desirable implementation. It seems to work for now. In particular, for 2.0 there is no reference implementation at all, so it is implemented independently.
+
+ *Automatic1111's Web UI seems to split with commas in mind, but in my case the split is simpler and does not go that far.
+
+ # Training procedure
+
+ Refer to this repository's README beforehand and set up the environment.
+
+ ## Preparing the data
+
+ See [Preparing training data](./train_README-ja.md). Fine tuning only supports the fine tuning method that uses metadata.
+
+ ## Running the training
+ For example, run as follows. The following settings are for reducing memory usage. Rewrite each line as necessary.
+
+ ```
+ accelerate launch --num_cpu_threads_per_process 1 fine_tune.py
+ --pretrained_model_name_or_path=<.ckpt or .safetensors file, or Diffusers model directory>
+ --output_dir=<output folder for the trained model>
+ --output_name=<file name for the trained model output>
+ --dataset_config=<.toml file created during data preparation>
+ --save_model_as=safetensors
+ --learning_rate=5e-6 --max_train_steps=10000
+ --use_8bit_adam --xformers --gradient_checkpointing
+ --mixed_precision=fp16
+ ```
+
+ It usually seems best to specify 1 for `num_cpu_threads_per_process`.
+
+ Specify the base model for additional training in `pretrained_model_name_or_path`. You can specify a Stable Diffusion checkpoint file (.ckpt or .safetensors), a Diffusers model directory on local disk, or a Diffusers model ID (such as "stabilityai/stable-diffusion-2").
+
+ Specify the folder to save the trained model to in `output_dir`, and the model file name without the extension in `output_name`. `save_model_as` specifies saving in safetensors format.
+
+ Specify the `.toml` file in `dataset_config`. Initially set the batch size inside the file to `1` to keep memory consumption down.
+
+ The number of training steps `max_train_steps` is set to 10000. The learning rate `learning_rate` is set to 5e-6 here.
+
+ Specify `mixed_precision="fp16"` to save memory (on RTX 30 series and later, `bf16` can also be specified; match the setting you made in accelerate during environment setup). Also specify `gradient_checkpointing`.
+
+ To use the memory-efficient 8bit AdamW as the optimizer (the class that optimizes, i.e. trains, the model to fit the training data), specify `optimizer_type="AdamW8bit"`.
+
+ Specify the `xformers` option to use xformers' CrossAttention. If xformers is not installed or causes an error (depending on the environment, for example with `mixed_precision="no"`), you can specify the `mem_eff_attn` option instead to use a memory-saving version of CrossAttention (it is slower).
+
+ If you have some memory to spare, edit the `.toml` file and increase the batch size to, for example, around `4` (this may improve speed and accuracy).
+
+ ### Commonly used options
+
+ Refer to the documentation on options in the following cases.
+
+ - Training Stable Diffusion 2.x or a model derived from it
+ - Training a model that assumes a clip skip of 2 or more
+ - Training with captions longer than 75 tokens
+
+ ### About batch size
+
+ Since the whole model is trained, memory consumption is higher than for training LoRA and the like (the same as DreamBooth).
+
+ ### About the learning rate
+
+ Around 1e-6 to 5e-6 seems to be common. Also look at other fine tuning examples.
+
+ ### Command line when using the old-style dataset specification
+
+ Specify the resolution and batch size with options. An example command line is as follows.
+
+ ```
+ accelerate launch --num_cpu_threads_per_process 1 fine_tune.py
+ --pretrained_model_name_or_path=model.ckpt
+ --in_json meta_lat.json
+ --train_data_dir=train_data
+ --output_dir=fine_tuned
+ --shuffle_caption
+ --train_batch_size=1 --learning_rate=5e-6 --max_train_steps=10000
+ --use_8bit_adam --xformers --gradient_checkpointing
+ --mixed_precision=bf16
+ --save_every_n_epochs=4
+ ```
+
+ <!--
+ ### Training with fp16 gradients (experimental feature)
+ Specifying the full_fp16 option changes the gradients from the usual float32 to float16 (fp16) for training (this appears to be full fp16 training rather than mixed precision). With this, SD1.x at 512*512 can apparently be trained with less than 8GB of VRAM, and SD2.x at 512*512 with less than 12GB.
+
+ Specify fp16 in accelerate config in advance, and set the mixed_precision="fp16" option (it does not work with bf16).
+
+ To minimize memory usage, specify the xformers, use_8bit_adam, and gradient_checkpointing options and set train_batch_size to 1.
+ (If you have room, increasing train_batch_size gradually should improve accuracy slightly.)
+
+ This is forcibly achieved by patching the PyTorch source (confirmed with PyTorch 1.12.1 and 1.13.0). Accuracy drops considerably and the probability of training failing partway through also increases. The learning rate and step count settings also seem to be sensitive. Please use this at your own risk with these points in mind.
+ -->
+
+ # Other main options specific to fine tuning
+
+ For all options, refer to the separate document.
+
+ ## `train_text_encoder`
+ Makes the Text Encoder a training target as well. Memory usage increases slightly.
+
+ In normal fine tuning the Text Encoder is not trained (probably because the U-Net is trained to follow the Text Encoder's output), but when the amount of training data is small, having the Text Encoder learn as in DreamBooth also seems to be effective.
+
+ ## `diffusers_xformers`
+ Uses Diffusers' xformers feature instead of the script's own xformers replacement feature. Hypernetwork training becomes unavailable.
finetune/blip/blip.py ADDED
@@ -0,0 +1,240 @@
1
+ '''
2
+ * Copyright (c) 2022, salesforce.com, inc.
3
+ * All rights reserved.
4
+ * SPDX-License-Identifier: BSD-3-Clause
5
+ * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ * By Junnan Li
7
+ '''
8
+ import warnings
9
+ warnings.filterwarnings("ignore")
10
+
11
+ # from models.vit import VisionTransformer, interpolate_pos_embed
12
+ # from models.med import BertConfig, BertModel, BertLMHeadModel
13
+ from blip.vit import VisionTransformer, interpolate_pos_embed
14
+ from blip.med import BertConfig, BertModel, BertLMHeadModel
15
+ from transformers import BertTokenizer
16
+
17
+ import torch
18
+ from torch import nn
19
+ import torch.nn.functional as F
20
+
21
+ import os
22
+ from urllib.parse import urlparse
23
+ from timm.models.hub import download_cached_file
24
+
25
+ class BLIP_Base(nn.Module):
26
+ def __init__(self,
27
+ med_config = 'configs/med_config.json',
28
+ image_size = 224,
29
+ vit = 'base',
30
+ vit_grad_ckpt = False,
31
+ vit_ckpt_layer = 0,
32
+ ):
33
+ """
34
+ Args:
35
+ med_config (str): path for the mixture of encoder-decoder model's configuration file
36
+ image_size (int): input image size
37
+ vit (str): model size of vision transformer
38
+ """
39
+ super().__init__()
40
+
41
+ self.visual_encoder, vision_width = create_vit(vit,image_size, vit_grad_ckpt, vit_ckpt_layer)
42
+ self.tokenizer = init_tokenizer()
43
+ med_config = BertConfig.from_json_file(med_config)
44
+ med_config.encoder_width = vision_width
45
+ self.text_encoder = BertModel(config=med_config, add_pooling_layer=False)
46
+
47
+
48
+ def forward(self, image, caption, mode):
49
+
50
+ assert mode in ['image', 'text', 'multimodal'], "mode parameter must be image, text, or multimodal"
51
+ text = self.tokenizer(caption, return_tensors="pt").to(image.device)
52
+
53
+ if mode=='image':
54
+ # return image features
55
+ image_embeds = self.visual_encoder(image)
56
+ return image_embeds
57
+
58
+ elif mode=='text':
59
+ # return text features
60
+ text_output = self.text_encoder(text.input_ids, attention_mask = text.attention_mask,
61
+ return_dict = True, mode = 'text')
62
+ return text_output.last_hidden_state
63
+
64
+ elif mode=='multimodal':
65
+ # return multimodal features
66
+ image_embeds = self.visual_encoder(image)
67
+ image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
68
+
69
+ text.input_ids[:,0] = self.tokenizer.enc_token_id
70
+ output = self.text_encoder(text.input_ids,
71
+ attention_mask = text.attention_mask,
72
+ encoder_hidden_states = image_embeds,
73
+ encoder_attention_mask = image_atts,
74
+ return_dict = True,
75
+ )
76
+ return output.last_hidden_state
77
+
78
+
79
+
80
+ class BLIP_Decoder(nn.Module):
81
+ def __init__(self,
82
+ med_config = 'configs/med_config.json',
83
+ image_size = 384,
84
+ vit = 'base',
85
+ vit_grad_ckpt = False,
86
+ vit_ckpt_layer = 0,
87
+ prompt = 'a picture of ',
88
+ ):
89
+ """
90
+ Args:
91
+ med_config (str): path for the mixture of encoder-decoder model's configuration file
92
+ image_size (int): input image size
93
+ vit (str): model size of vision transformer
94
+ """
95
+ super().__init__()
96
+
97
+ self.visual_encoder, vision_width = create_vit(vit,image_size, vit_grad_ckpt, vit_ckpt_layer)
98
+ self.tokenizer = init_tokenizer()
99
+ med_config = BertConfig.from_json_file(med_config)
100
+ med_config.encoder_width = vision_width
101
+ self.text_decoder = BertLMHeadModel(config=med_config)
102
+
103
+ self.prompt = prompt
104
+ self.prompt_length = len(self.tokenizer(self.prompt).input_ids)-1
105
+
106
+
107
+ def forward(self, image, caption):
108
+
109
+ image_embeds = self.visual_encoder(image)
110
+ image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
111
+
112
+ text = self.tokenizer(caption, padding='longest', truncation=True, max_length=40, return_tensors="pt").to(image.device)
113
+
114
+ text.input_ids[:,0] = self.tokenizer.bos_token_id
115
+
116
+ decoder_targets = text.input_ids.masked_fill(text.input_ids == self.tokenizer.pad_token_id, -100)
117
+ decoder_targets[:,:self.prompt_length] = -100
118
+
119
+ decoder_output = self.text_decoder(text.input_ids,
120
+ attention_mask = text.attention_mask,
121
+ encoder_hidden_states = image_embeds,
122
+ encoder_attention_mask = image_atts,
123
+ labels = decoder_targets,
124
+ return_dict = True,
125
+ )
126
+ loss_lm = decoder_output.loss
127
+
128
+ return loss_lm
129
+
130
+ def generate(self, image, sample=False, num_beams=3, max_length=30, min_length=10, top_p=0.9, repetition_penalty=1.0):
131
+ image_embeds = self.visual_encoder(image)
132
+
133
+ if not sample:
134
+ image_embeds = image_embeds.repeat_interleave(num_beams,dim=0)
135
+
136
+ image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(image.device)
137
+ model_kwargs = {"encoder_hidden_states": image_embeds, "encoder_attention_mask":image_atts}
138
+
139
+ prompt = [self.prompt] * image.size(0)
140
+ input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(image.device)
141
+ input_ids[:,0] = self.tokenizer.bos_token_id
142
+ input_ids = input_ids[:, :-1]
143
+
144
+ if sample:
145
+ #nucleus sampling
146
+ outputs = self.text_decoder.generate(input_ids=input_ids,
147
+ max_length=max_length,
148
+ min_length=min_length,
149
+ do_sample=True,
150
+ top_p=top_p,
151
+ num_return_sequences=1,
152
+ eos_token_id=self.tokenizer.sep_token_id,
153
+ pad_token_id=self.tokenizer.pad_token_id,
154
+ repetition_penalty=1.1,
155
+ **model_kwargs)
156
+ else:
157
+ #beam search
158
+ outputs = self.text_decoder.generate(input_ids=input_ids,
159
+ max_length=max_length,
160
+ min_length=min_length,
161
+ num_beams=num_beams,
162
+ eos_token_id=self.tokenizer.sep_token_id,
163
+ pad_token_id=self.tokenizer.pad_token_id,
164
+ repetition_penalty=repetition_penalty,
165
+ **model_kwargs)
166
+
167
+ captions = []
168
+ for output in outputs:
169
+ caption = self.tokenizer.decode(output, skip_special_tokens=True)
170
+ captions.append(caption[len(self.prompt):])
171
+ return captions
172
+
173
+
174
+ def blip_decoder(pretrained='',**kwargs):
175
+ model = BLIP_Decoder(**kwargs)
176
+ if pretrained:
177
+ model,msg = load_checkpoint(model,pretrained)
178
+ assert(len(msg.missing_keys)==0)
179
+ return model
180
+
181
+ def blip_feature_extractor(pretrained='',**kwargs):
182
+ model = BLIP_Base(**kwargs)
183
+ if pretrained:
184
+ model,msg = load_checkpoint(model,pretrained)
185
+ assert(len(msg.missing_keys)==0)
186
+ return model
187
+
188
+ def init_tokenizer():
189
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
190
+ tokenizer.add_special_tokens({'bos_token':'[DEC]'})
191
+ tokenizer.add_special_tokens({'additional_special_tokens':['[ENC]']})
192
+ tokenizer.enc_token_id = tokenizer.additional_special_tokens_ids[0]
193
+ return tokenizer
194
+
195
+
196
+ def create_vit(vit, image_size, use_grad_checkpointing=False, ckpt_layer=0, drop_path_rate=0):
197
+
198
+ assert vit in ['base', 'large'], "vit parameter must be base or large"
199
+ if vit=='base':
200
+ vision_width = 768
201
+ visual_encoder = VisionTransformer(img_size=image_size, patch_size=16, embed_dim=vision_width, depth=12,
202
+ num_heads=12, use_grad_checkpointing=use_grad_checkpointing, ckpt_layer=ckpt_layer,
203
+ drop_path_rate=0 or drop_path_rate
204
+ )
205
+ elif vit=='large':
206
+ vision_width = 1024
207
+ visual_encoder = VisionTransformer(img_size=image_size, patch_size=16, embed_dim=vision_width, depth=24,
208
+ num_heads=16, use_grad_checkpointing=use_grad_checkpointing, ckpt_layer=ckpt_layer,
209
+ drop_path_rate=0.1 or drop_path_rate
210
+ )
211
+ return visual_encoder, vision_width
212
+
213
+ def is_url(url_or_filename):
214
+ parsed = urlparse(url_or_filename)
215
+ return parsed.scheme in ("http", "https")
216
+
217
+ def load_checkpoint(model,url_or_filename):
218
+ if is_url(url_or_filename):
219
+ cached_file = download_cached_file(url_or_filename, check_hash=False, progress=True)
220
+ checkpoint = torch.load(cached_file, map_location='cpu')
221
+ elif os.path.isfile(url_or_filename):
222
+ checkpoint = torch.load(url_or_filename, map_location='cpu')
223
+ else:
224
+ raise RuntimeError('checkpoint url or path is invalid')
225
+
226
+ state_dict = checkpoint['model']
227
+
228
+ state_dict['visual_encoder.pos_embed'] = interpolate_pos_embed(state_dict['visual_encoder.pos_embed'],model.visual_encoder)
229
+ if 'visual_encoder_m.pos_embed' in model.state_dict().keys():
230
+ state_dict['visual_encoder_m.pos_embed'] = interpolate_pos_embed(state_dict['visual_encoder_m.pos_embed'],
231
+ model.visual_encoder_m)
232
+ for key in model.state_dict().keys():
233
+ if key in state_dict.keys():
234
+ if state_dict[key].shape!=model.state_dict()[key].shape:
235
+ del state_dict[key]
236
+
237
+ msg = model.load_state_dict(state_dict,strict=False)
238
+ print('load checkpoint from %s'%url_or_filename)
239
+ return model,msg
240
+
finetune/blip/med.py ADDED
@@ -0,0 +1,955 @@
1
+ '''
2
+ * Copyright (c) 2022, salesforce.com, inc.
3
+ * All rights reserved.
4
+ * SPDX-License-Identifier: BSD-3-Clause
5
+ * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ * By Junnan Li
7
+ * Based on huggingface code base
8
+ * https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert
9
+ '''
10
+
11
+ import math
12
+ import os
13
+ import warnings
14
+ from dataclasses import dataclass
15
+ from typing import Optional, Tuple
16
+
17
+ import torch
18
+ from torch import Tensor, device, dtype, nn
19
+ import torch.utils.checkpoint
20
+ from torch import nn
21
+ from torch.nn import CrossEntropyLoss
22
+ import torch.nn.functional as F
23
+
24
+ from transformers.activations import ACT2FN
25
+ from transformers.file_utils import (
26
+ ModelOutput,
27
+ )
28
+ from transformers.modeling_outputs import (
29
+ BaseModelOutputWithPastAndCrossAttentions,
30
+ BaseModelOutputWithPoolingAndCrossAttentions,
31
+ CausalLMOutputWithCrossAttentions,
32
+ MaskedLMOutput,
33
+ MultipleChoiceModelOutput,
34
+ NextSentencePredictorOutput,
35
+ QuestionAnsweringModelOutput,
36
+ SequenceClassifierOutput,
37
+ TokenClassifierOutput,
38
+ )
39
+ from transformers.modeling_utils import (
40
+ PreTrainedModel,
41
+ apply_chunking_to_forward,
42
+ find_pruneable_heads_and_indices,
43
+ prune_linear_layer,
44
+ )
45
+ from transformers.utils import logging
46
+ from transformers.models.bert.configuration_bert import BertConfig
47
+
48
+
49
+ logger = logging.get_logger(__name__)
50
+
51
+
52
+ class BertEmbeddings(nn.Module):
53
+ """Construct the embeddings from word and position embeddings."""
54
+
55
+ def __init__(self, config):
56
+ super().__init__()
57
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
58
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
59
+
60
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
61
+ # any TensorFlow checkpoint file
62
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
63
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
64
+
65
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
66
+ self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
67
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
68
+
69
+ self.config = config
70
+
71
+ def forward(
72
+ self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
73
+ ):
74
+ if input_ids is not None:
75
+ input_shape = input_ids.size()
76
+ else:
77
+ input_shape = inputs_embeds.size()[:-1]
78
+
79
+ seq_length = input_shape[1]
80
+
81
+ if position_ids is None:
82
+ position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
83
+
84
+ if inputs_embeds is None:
85
+ inputs_embeds = self.word_embeddings(input_ids)
86
+
87
+ embeddings = inputs_embeds
88
+
89
+ if self.position_embedding_type == "absolute":
90
+ position_embeddings = self.position_embeddings(position_ids)
91
+ embeddings += position_embeddings
92
+ embeddings = self.LayerNorm(embeddings)
93
+ embeddings = self.dropout(embeddings)
94
+ return embeddings
95
+
96
+
97
+ class BertSelfAttention(nn.Module):
98
+ def __init__(self, config, is_cross_attention):
99
+ super().__init__()
100
+ self.config = config
101
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
102
+ raise ValueError(
103
+ "The hidden size (%d) is not a multiple of the number of attention "
104
+ "heads (%d)" % (config.hidden_size, config.num_attention_heads)
105
+ )
106
+
107
+ self.num_attention_heads = config.num_attention_heads
108
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
109
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
110
+
111
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
112
+ if is_cross_attention:
113
+ self.key = nn.Linear(config.encoder_width, self.all_head_size)
114
+ self.value = nn.Linear(config.encoder_width, self.all_head_size)
115
+ else:
116
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
117
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
118
+
119
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
120
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
121
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
122
+ self.max_position_embeddings = config.max_position_embeddings
123
+ self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
124
+ self.save_attention = False
125
+
126
+ def save_attn_gradients(self, attn_gradients):
127
+ self.attn_gradients = attn_gradients
128
+
129
+ def get_attn_gradients(self):
130
+ return self.attn_gradients
131
+
132
+ def save_attention_map(self, attention_map):
133
+ self.attention_map = attention_map
134
+
135
+ def get_attention_map(self):
136
+ return self.attention_map
137
+
138
+ def transpose_for_scores(self, x):
139
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
140
+ x = x.view(*new_x_shape)
141
+ return x.permute(0, 2, 1, 3)
142
+
143
+ def forward(
144
+ self,
145
+ hidden_states,
146
+ attention_mask=None,
147
+ head_mask=None,
148
+ encoder_hidden_states=None,
149
+ encoder_attention_mask=None,
150
+ past_key_value=None,
151
+ output_attentions=False,
152
+ ):
153
+ mixed_query_layer = self.query(hidden_states)
154
+
155
+ # If this is instantiated as a cross-attention module, the keys
156
+ # and values come from an encoder; the attention mask needs to be
157
+ # such that the encoder's padding tokens are not attended to.
158
+ is_cross_attention = encoder_hidden_states is not None
159
+
160
+ if is_cross_attention:
161
+ key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
162
+ value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
163
+ attention_mask = encoder_attention_mask
164
+ elif past_key_value is not None:
165
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
166
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
167
+ key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
168
+ value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
169
+ else:
170
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
171
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
172
+
173
+ query_layer = self.transpose_for_scores(mixed_query_layer)
174
+
175
+ past_key_value = (key_layer, value_layer)
176
+
177
+ # Take the dot product between "query" and "key" to get the raw attention scores.
178
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
179
+
180
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
181
+ seq_length = hidden_states.size()[1]
182
+ position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
183
+ position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
184
+ distance = position_ids_l - position_ids_r
185
+ positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
186
+ positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
187
+
188
+ if self.position_embedding_type == "relative_key":
189
+ relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
190
+ attention_scores = attention_scores + relative_position_scores
191
+ elif self.position_embedding_type == "relative_key_query":
192
+ relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
193
+ relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
194
+ attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
195
+
196
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
197
+ if attention_mask is not None:
198
+ # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
199
+ attention_scores = attention_scores + attention_mask
200
+
201
+ # Normalize the attention scores to probabilities.
202
+ attention_probs = nn.Softmax(dim=-1)(attention_scores)
203
+
204
+ if is_cross_attention and self.save_attention:
205
+ self.save_attention_map(attention_probs)
206
+ attention_probs.register_hook(self.save_attn_gradients)
207
+
208
+ # This is actually dropping out entire tokens to attend to, which might
209
+ # seem a bit unusual, but is taken from the original Transformer paper.
210
+ attention_probs_dropped = self.dropout(attention_probs)
211
+
212
+ # Mask heads if we want to
213
+ if head_mask is not None:
214
+ attention_probs_dropped = attention_probs_dropped * head_mask
215
+
216
+ context_layer = torch.matmul(attention_probs_dropped, value_layer)
217
+
218
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
219
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
220
+ context_layer = context_layer.view(*new_context_layer_shape)
221
+
222
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
223
+
224
+ outputs = outputs + (past_key_value,)
225
+ return outputs
226
+
227
+
228
+ class BertSelfOutput(nn.Module):
229
+ def __init__(self, config):
230
+ super().__init__()
231
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
232
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
233
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
234
+
235
+ def forward(self, hidden_states, input_tensor):
236
+ hidden_states = self.dense(hidden_states)
237
+ hidden_states = self.dropout(hidden_states)
238
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
239
+ return hidden_states
240
+
241
+
242
+ class BertAttention(nn.Module):
243
+ def __init__(self, config, is_cross_attention=False):
244
+ super().__init__()
245
+ self.self = BertSelfAttention(config, is_cross_attention)
246
+ self.output = BertSelfOutput(config)
247
+ self.pruned_heads = set()
248
+
249
+ def prune_heads(self, heads):
250
+ if len(heads) == 0:
251
+ return
252
+ heads, index = find_pruneable_heads_and_indices(
253
+ heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
254
+ )
255
+
256
+ # Prune linear layers
257
+ self.self.query = prune_linear_layer(self.self.query, index)
258
+ self.self.key = prune_linear_layer(self.self.key, index)
259
+ self.self.value = prune_linear_layer(self.self.value, index)
260
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
261
+
262
+ # Update hyper params and store pruned heads
263
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
264
+ self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
265
+ self.pruned_heads = self.pruned_heads.union(heads)
266
+
267
+ def forward(
268
+ self,
269
+ hidden_states,
270
+ attention_mask=None,
271
+ head_mask=None,
272
+ encoder_hidden_states=None,
273
+ encoder_attention_mask=None,
274
+ past_key_value=None,
275
+ output_attentions=False,
276
+ ):
277
+ self_outputs = self.self(
278
+ hidden_states,
279
+ attention_mask,
280
+ head_mask,
281
+ encoder_hidden_states,
282
+ encoder_attention_mask,
283
+ past_key_value,
284
+ output_attentions,
285
+ )
286
+ attention_output = self.output(self_outputs[0], hidden_states)
287
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
288
+ return outputs
289
+
290
+
291
+ class BertIntermediate(nn.Module):
292
+ def __init__(self, config):
293
+ super().__init__()
294
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
295
+ if isinstance(config.hidden_act, str):
296
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
297
+ else:
298
+ self.intermediate_act_fn = config.hidden_act
299
+
300
+ def forward(self, hidden_states):
301
+ hidden_states = self.dense(hidden_states)
302
+ hidden_states = self.intermediate_act_fn(hidden_states)
303
+ return hidden_states
304
+
305
+
306
+ class BertOutput(nn.Module):
307
+ def __init__(self, config):
308
+ super().__init__()
309
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
310
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
311
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
312
+
313
+ def forward(self, hidden_states, input_tensor):
314
+ hidden_states = self.dense(hidden_states)
315
+ hidden_states = self.dropout(hidden_states)
316
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
317
+ return hidden_states
318
+
319
+
320
+ class BertLayer(nn.Module):
321
+ def __init__(self, config, layer_num):
322
+ super().__init__()
323
+ self.config = config
324
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
325
+ self.seq_len_dim = 1
326
+ self.attention = BertAttention(config)
327
+ self.layer_num = layer_num
328
+ if self.config.add_cross_attention:
329
+ self.crossattention = BertAttention(config, is_cross_attention=self.config.add_cross_attention)
330
+ self.intermediate = BertIntermediate(config)
331
+ self.output = BertOutput(config)
332
+
333
+ def forward(
334
+ self,
335
+ hidden_states,
336
+ attention_mask=None,
337
+ head_mask=None,
338
+ encoder_hidden_states=None,
339
+ encoder_attention_mask=None,
340
+ past_key_value=None,
341
+ output_attentions=False,
342
+ mode=None,
343
+ ):
344
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
345
+ self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
346
+ self_attention_outputs = self.attention(
347
+ hidden_states,
348
+ attention_mask,
349
+ head_mask,
350
+ output_attentions=output_attentions,
351
+ past_key_value=self_attn_past_key_value,
352
+ )
353
+ attention_output = self_attention_outputs[0]
354
+
355
+ outputs = self_attention_outputs[1:-1]
356
+ present_key_value = self_attention_outputs[-1]
357
+
358
+ if mode=='multimodal':
359
+ assert encoder_hidden_states is not None, "encoder_hidden_states must be given for cross-attention layers"
360
+
361
+ cross_attention_outputs = self.crossattention(
362
+ attention_output,
363
+ attention_mask,
364
+ head_mask,
365
+ encoder_hidden_states,
366
+ encoder_attention_mask,
367
+ output_attentions=output_attentions,
368
+ )
369
+ attention_output = cross_attention_outputs[0]
370
+ outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
371
+ layer_output = apply_chunking_to_forward(
372
+ self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
373
+ )
374
+ outputs = (layer_output,) + outputs
375
+
376
+ outputs = outputs + (present_key_value,)
377
+
378
+ return outputs
379
+
380
+ def feed_forward_chunk(self, attention_output):
381
+ intermediate_output = self.intermediate(attention_output)
382
+ layer_output = self.output(intermediate_output, attention_output)
383
+ return layer_output
384
+
385
+
386
+ class BertEncoder(nn.Module):
387
+ def __init__(self, config):
388
+ super().__init__()
389
+ self.config = config
390
+ self.layer = nn.ModuleList([BertLayer(config,i) for i in range(config.num_hidden_layers)])
391
+ self.gradient_checkpointing = False
392
+
393
+ def forward(
394
+ self,
395
+ hidden_states,
396
+ attention_mask=None,
397
+ head_mask=None,
398
+ encoder_hidden_states=None,
399
+ encoder_attention_mask=None,
400
+ past_key_values=None,
401
+ use_cache=None,
402
+ output_attentions=False,
403
+ output_hidden_states=False,
404
+ return_dict=True,
405
+ mode='multimodal',
406
+ ):
407
+ all_hidden_states = () if output_hidden_states else None
408
+ all_self_attentions = () if output_attentions else None
409
+ all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
410
+
411
+ next_decoder_cache = () if use_cache else None
412
+
413
+ for i in range(self.config.num_hidden_layers):
414
+ layer_module = self.layer[i]
415
+ if output_hidden_states:
416
+ all_hidden_states = all_hidden_states + (hidden_states,)
417
+
418
+ layer_head_mask = head_mask[i] if head_mask is not None else None
419
+ past_key_value = past_key_values[i] if past_key_values is not None else None
420
+
421
+ if self.gradient_checkpointing and self.training:
422
+
423
+ if use_cache:
424
+ logger.warn(
425
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
426
+ )
427
+ use_cache = False
428
+
429
+ def create_custom_forward(module):
430
+ def custom_forward(*inputs):
431
+ return module(*inputs, past_key_value, output_attentions)
432
+
433
+ return custom_forward
434
+
435
+ layer_outputs = torch.utils.checkpoint.checkpoint(
436
+ create_custom_forward(layer_module),
437
+ hidden_states,
438
+ attention_mask,
439
+ layer_head_mask,
440
+ encoder_hidden_states,
441
+ encoder_attention_mask,
442
+ mode=mode,
443
+ )
444
+ else:
445
+ layer_outputs = layer_module(
446
+ hidden_states,
447
+ attention_mask,
448
+ layer_head_mask,
449
+ encoder_hidden_states,
450
+ encoder_attention_mask,
451
+ past_key_value,
452
+ output_attentions,
453
+ mode=mode,
454
+ )
455
+
456
+ hidden_states = layer_outputs[0]
457
+ if use_cache:
458
+ next_decoder_cache += (layer_outputs[-1],)
459
+ if output_attentions:
460
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
461
+
462
+ if output_hidden_states:
463
+ all_hidden_states = all_hidden_states + (hidden_states,)
464
+
465
+ if not return_dict:
466
+ return tuple(
467
+ v
468
+ for v in [
469
+ hidden_states,
470
+ next_decoder_cache,
471
+ all_hidden_states,
472
+ all_self_attentions,
473
+ all_cross_attentions,
474
+ ]
475
+ if v is not None
476
+ )
477
+ return BaseModelOutputWithPastAndCrossAttentions(
478
+ last_hidden_state=hidden_states,
479
+ past_key_values=next_decoder_cache,
480
+ hidden_states=all_hidden_states,
481
+ attentions=all_self_attentions,
482
+ cross_attentions=all_cross_attentions,
483
+ )
484
+
485
+
486
+ class BertPooler(nn.Module):
487
+ def __init__(self, config):
488
+ super().__init__()
489
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
490
+ self.activation = nn.Tanh()
491
+
492
+ def forward(self, hidden_states):
493
+ # We "pool" the model by simply taking the hidden state corresponding
494
+ # to the first token.
495
+ first_token_tensor = hidden_states[:, 0]
496
+ pooled_output = self.dense(first_token_tensor)
497
+ pooled_output = self.activation(pooled_output)
498
+ return pooled_output
499
+
500
+
501
+ class BertPredictionHeadTransform(nn.Module):
502
+ def __init__(self, config):
503
+ super().__init__()
504
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
505
+ if isinstance(config.hidden_act, str):
506
+ self.transform_act_fn = ACT2FN[config.hidden_act]
507
+ else:
508
+ self.transform_act_fn = config.hidden_act
509
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
510
+
511
+ def forward(self, hidden_states):
512
+ hidden_states = self.dense(hidden_states)
513
+ hidden_states = self.transform_act_fn(hidden_states)
514
+ hidden_states = self.LayerNorm(hidden_states)
515
+ return hidden_states
516
+
517
+
518
+ class BertLMPredictionHead(nn.Module):
519
+ def __init__(self, config):
520
+ super().__init__()
521
+ self.transform = BertPredictionHeadTransform(config)
522
+
523
+ # The output weights are the same as the input embeddings, but there is
524
+ # an output-only bias for each token.
525
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
526
+
527
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
528
+
529
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
530
+ self.decoder.bias = self.bias
531
+
532
+ def forward(self, hidden_states):
533
+ hidden_states = self.transform(hidden_states)
534
+ hidden_states = self.decoder(hidden_states)
535
+ return hidden_states
536
+
537
+
538
+ class BertOnlyMLMHead(nn.Module):
539
+ def __init__(self, config):
540
+ super().__init__()
541
+ self.predictions = BertLMPredictionHead(config)
542
+
543
+ def forward(self, sequence_output):
544
+ prediction_scores = self.predictions(sequence_output)
545
+ return prediction_scores
546
+
547
+
548
+ class BertPreTrainedModel(PreTrainedModel):
549
+ """
550
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
551
+ models.
552
+ """
553
+
554
+ config_class = BertConfig
555
+ base_model_prefix = "bert"
556
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
557
+
558
+ def _init_weights(self, module):
559
+ """ Initialize the weights """
560
+ if isinstance(module, (nn.Linear, nn.Embedding)):
561
+ # Slightly different from the TF version which uses truncated_normal for initialization
562
+ # cf https://github.com/pytorch/pytorch/pull/5617
563
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
564
+ elif isinstance(module, nn.LayerNorm):
565
+ module.bias.data.zero_()
566
+ module.weight.data.fill_(1.0)
567
+ if isinstance(module, nn.Linear) and module.bias is not None:
568
+ module.bias.data.zero_()
569
+
570
+
571
+ class BertModel(BertPreTrainedModel):
572
+ """
573
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
574
+ cross-attention is added between the self-attention layers, following the architecture described in `Attention is
575
+ all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
576
+ Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
577
+ argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an
578
+ input to the forward pass.
579
+ """
580
+
581
+ def __init__(self, config, add_pooling_layer=True):
582
+ super().__init__(config)
583
+ self.config = config
584
+
585
+ self.embeddings = BertEmbeddings(config)
586
+
587
+ self.encoder = BertEncoder(config)
588
+
589
+ self.pooler = BertPooler(config) if add_pooling_layer else None
590
+
591
+ self.init_weights()
592
+
593
+
594
+ def get_input_embeddings(self):
595
+ return self.embeddings.word_embeddings
596
+
597
+ def set_input_embeddings(self, value):
598
+ self.embeddings.word_embeddings = value
599
+
600
+ def _prune_heads(self, heads_to_prune):
601
+ """
602
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
603
+ class PreTrainedModel
604
+ """
605
+ for layer, heads in heads_to_prune.items():
606
+ self.encoder.layer[layer].attention.prune_heads(heads)
607
+
608
+
609
+ def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int], device: device, is_decoder: bool) -> Tensor:
610
+ """
611
+ Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
612
+
613
+ Arguments:
614
+ attention_mask (:obj:`torch.Tensor`):
615
+ Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
616
+ input_shape (:obj:`Tuple[int]`):
617
+ The shape of the input to the model.
618
+ device: (:obj:`torch.device`):
619
+ The device of the input to the model.
620
+
621
+ Returns:
622
+ :obj:`torch.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`.
623
+ """
624
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
625
+ # ourselves in which case we just need to make it broadcastable to all heads.
626
+ if attention_mask.dim() == 3:
627
+ extended_attention_mask = attention_mask[:, None, :, :]
628
+ elif attention_mask.dim() == 2:
629
+ # Provided a padding mask of dimensions [batch_size, seq_length]
630
+ # - if the model is a decoder, apply a causal mask in addition to the padding mask
631
+ # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
632
+ if is_decoder:
633
+ batch_size, seq_length = input_shape
634
+
635
+ seq_ids = torch.arange(seq_length, device=device)
636
+ causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
637
+ # in case past_key_values are used we need to add a prefix ones mask to the causal mask
638
+ # causal and attention masks must have same type with pytorch version < 1.3
639
+ causal_mask = causal_mask.to(attention_mask.dtype)
640
+
641
+ if causal_mask.shape[1] < attention_mask.shape[1]:
642
+ prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
643
+ causal_mask = torch.cat(
644
+ [
645
+ torch.ones((batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype),
646
+ causal_mask,
647
+ ],
648
+ axis=-1,
649
+ )
650
+
651
+ extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
652
+ else:
653
+ extended_attention_mask = attention_mask[:, None, None, :]
654
+ else:
655
+ raise ValueError(
656
+ "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
657
+ input_shape, attention_mask.shape
658
+ )
659
+ )
660
+
661
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
662
+ # masked positions, this operation will create a tensor which is 0.0 for
663
+ # positions we want to attend and -10000.0 for masked positions.
664
+ # Since we are adding it to the raw scores before the softmax, this is
665
+ # effectively the same as removing these entirely.
666
+ extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility
667
+ extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
668
+ return extended_attention_mask
669
+
670
+ def forward(
671
+ self,
672
+ input_ids=None,
673
+ attention_mask=None,
674
+ position_ids=None,
675
+ head_mask=None,
676
+ inputs_embeds=None,
677
+ encoder_embeds=None,
678
+ encoder_hidden_states=None,
679
+ encoder_attention_mask=None,
680
+ past_key_values=None,
681
+ use_cache=None,
682
+ output_attentions=None,
683
+ output_hidden_states=None,
684
+ return_dict=None,
685
+ is_decoder=False,
686
+ mode='multimodal',
687
+ ):
688
+ r"""
689
+ encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
690
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
691
+ the model is configured as a decoder.
692
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
693
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
694
+ the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
695
+ - 1 for tokens that are **not masked**,
696
+ - 0 for tokens that are **masked**.
697
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
698
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
699
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
700
+ (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
701
+ instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
702
+ use_cache (:obj:`bool`, `optional`):
703
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
704
+ decoding (see :obj:`past_key_values`).
705
+ """
706
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
707
+ output_hidden_states = (
708
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
709
+ )
710
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
711
+
712
+ if is_decoder:
713
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
714
+ else:
715
+ use_cache = False
716
+
717
+ if input_ids is not None and inputs_embeds is not None:
718
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
719
+ elif input_ids is not None:
720
+ input_shape = input_ids.size()
721
+ batch_size, seq_length = input_shape
722
+ device = input_ids.device
723
+ elif inputs_embeds is not None:
724
+ input_shape = inputs_embeds.size()[:-1]
725
+ batch_size, seq_length = input_shape
726
+ device = inputs_embeds.device
727
+ elif encoder_embeds is not None:
728
+ input_shape = encoder_embeds.size()[:-1]
729
+ batch_size, seq_length = input_shape
730
+ device = encoder_embeds.device
731
+ else:
732
+ raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds")
733
+
734
+ # past_key_values_length
735
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
736
+
737
+ if attention_mask is None:
738
+ attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
739
+
740
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
741
+ # ourselves in which case we just need to make it broadcastable to all heads.
742
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape,
743
+ device, is_decoder)
744
+
745
+ # If a 2D or 3D attention mask is provided for the cross-attention
746
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
747
+ if encoder_hidden_states is not None:
748
+ if type(encoder_hidden_states) == list:
749
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
750
+ else:
751
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
752
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
753
+
754
+ if type(encoder_attention_mask) == list:
755
+ encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
756
+ elif encoder_attention_mask is None:
757
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
758
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
759
+ else:
760
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
761
+ else:
762
+ encoder_extended_attention_mask = None
763
+
764
+ # Prepare head mask if needed
765
+ # 1.0 in head_mask indicate we keep the head
766
+ # attention_probs has shape bsz x n_heads x N x N
767
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
768
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
769
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
770
+
771
+ if encoder_embeds is None:
772
+ embedding_output = self.embeddings(
773
+ input_ids=input_ids,
774
+ position_ids=position_ids,
775
+ inputs_embeds=inputs_embeds,
776
+ past_key_values_length=past_key_values_length,
777
+ )
778
+ else:
779
+ embedding_output = encoder_embeds
780
+
781
+ encoder_outputs = self.encoder(
782
+ embedding_output,
783
+ attention_mask=extended_attention_mask,
784
+ head_mask=head_mask,
785
+ encoder_hidden_states=encoder_hidden_states,
786
+ encoder_attention_mask=encoder_extended_attention_mask,
787
+ past_key_values=past_key_values,
788
+ use_cache=use_cache,
789
+ output_attentions=output_attentions,
790
+ output_hidden_states=output_hidden_states,
791
+ return_dict=return_dict,
792
+ mode=mode,
793
+ )
794
+ sequence_output = encoder_outputs[0]
795
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
796
+
797
+ if not return_dict:
798
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
799
+
800
+ return BaseModelOutputWithPoolingAndCrossAttentions(
801
+ last_hidden_state=sequence_output,
802
+ pooler_output=pooled_output,
803
+ past_key_values=encoder_outputs.past_key_values,
804
+ hidden_states=encoder_outputs.hidden_states,
805
+ attentions=encoder_outputs.attentions,
806
+ cross_attentions=encoder_outputs.cross_attentions,
807
+ )
808
+
809
+
810
+
811
+ class BertLMHeadModel(BertPreTrainedModel):
812
+
813
+ _keys_to_ignore_on_load_unexpected = [r"pooler"]
814
+ _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
815
+
816
+ def __init__(self, config):
817
+ super().__init__(config)
818
+
819
+ self.bert = BertModel(config, add_pooling_layer=False)
820
+ self.cls = BertOnlyMLMHead(config)
821
+
822
+ self.init_weights()
823
+
824
+ def get_output_embeddings(self):
825
+ return self.cls.predictions.decoder
826
+
827
+ def set_output_embeddings(self, new_embeddings):
828
+ self.cls.predictions.decoder = new_embeddings
829
+
830
+ def forward(
831
+ self,
832
+ input_ids=None,
833
+ attention_mask=None,
834
+ position_ids=None,
835
+ head_mask=None,
836
+ inputs_embeds=None,
837
+ encoder_hidden_states=None,
838
+ encoder_attention_mask=None,
839
+ labels=None,
840
+ past_key_values=None,
841
+ use_cache=None,
842
+ output_attentions=None,
843
+ output_hidden_states=None,
844
+ return_dict=None,
845
+ return_logits=False,
846
+ is_decoder=True,
847
+ reduction='mean',
848
+ mode='multimodal',
849
+ ):
850
+ r"""
851
+ encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
852
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
853
+ the model is configured as a decoder.
854
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
855
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
856
+ the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
857
+ - 1 for tokens that are **not masked**,
858
+ - 0 for tokens that are **masked**.
859
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
860
+ Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
861
+ ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
862
+ ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]``
863
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
864
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
865
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
866
+ (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
867
+ instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
868
+ use_cache (:obj:`bool`, `optional`):
869
+ If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
870
+ decoding (see :obj:`past_key_values`).
871
+ Returns:
872
+ Example::
873
+ >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
874
+ >>> import torch
875
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
876
+ >>> config = BertConfig.from_pretrained("bert-base-cased")
877
+ >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
878
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
879
+ >>> outputs = model(**inputs)
880
+ >>> prediction_logits = outputs.logits
881
+ """
882
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
883
+ if labels is not None:
884
+ use_cache = False
885
+
886
+ outputs = self.bert(
887
+ input_ids,
888
+ attention_mask=attention_mask,
889
+ position_ids=position_ids,
890
+ head_mask=head_mask,
891
+ inputs_embeds=inputs_embeds,
892
+ encoder_hidden_states=encoder_hidden_states,
893
+ encoder_attention_mask=encoder_attention_mask,
894
+ past_key_values=past_key_values,
895
+ use_cache=use_cache,
896
+ output_attentions=output_attentions,
897
+ output_hidden_states=output_hidden_states,
898
+ return_dict=return_dict,
899
+ is_decoder=is_decoder,
900
+ mode=mode,
901
+ )
902
+
903
+ sequence_output = outputs[0]
904
+ prediction_scores = self.cls(sequence_output)
905
+
906
+ if return_logits:
907
+ return prediction_scores[:, :-1, :].contiguous()
908
+
909
+ lm_loss = None
910
+ if labels is not None:
911
+ # we are doing next-token prediction; shift prediction scores and input ids by one
912
+ shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
913
+ labels = labels[:, 1:].contiguous()
914
+ loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1)
915
+ lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
916
+ if reduction=='none':
917
+ lm_loss = lm_loss.view(prediction_scores.size(0),-1).sum(1)
918
+
919
+ if not return_dict:
920
+ output = (prediction_scores,) + outputs[2:]
921
+ return ((lm_loss,) + output) if lm_loss is not None else output
922
+
923
+ return CausalLMOutputWithCrossAttentions(
924
+ loss=lm_loss,
925
+ logits=prediction_scores,
926
+ past_key_values=outputs.past_key_values,
927
+ hidden_states=outputs.hidden_states,
928
+ attentions=outputs.attentions,
929
+ cross_attentions=outputs.cross_attentions,
930
+ )
931
+
932
+ def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs):
933
+ input_shape = input_ids.shape
934
+ # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
935
+ if attention_mask is None:
936
+ attention_mask = input_ids.new_ones(input_shape)
937
+
938
+ # cut decoder_input_ids if past is used
939
+ if past is not None:
940
+ input_ids = input_ids[:, -1:]
941
+
942
+ return {
943
+ "input_ids": input_ids,
944
+ "attention_mask": attention_mask,
945
+ "past_key_values": past,
946
+ "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
947
+ "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
948
+ "is_decoder": True,
949
+ }
950
+
951
+ def _reorder_cache(self, past, beam_idx):
952
+ reordered_past = ()
953
+ for layer_past in past:
954
+ reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
955
+ return reordered_past
finetune/blip/med_config.json ADDED
@@ -0,0 +1,22 @@
1
+ {
2
+ "architectures": [
3
+ "BertModel"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "hidden_act": "gelu",
7
+ "hidden_dropout_prob": 0.1,
8
+ "hidden_size": 768,
9
+ "initializer_range": 0.02,
10
+ "intermediate_size": 3072,
11
+ "layer_norm_eps": 1e-12,
12
+ "max_position_embeddings": 512,
13
+ "model_type": "bert",
14
+ "num_attention_heads": 12,
15
+ "num_hidden_layers": 12,
16
+ "pad_token_id": 0,
17
+ "type_vocab_size": 2,
18
+ "vocab_size": 30524,
19
+ "encoder_width": 768,
20
+ "add_cross_attention": true
21
+ }
22
+
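For reference, a minimal loading sketch for this configuration (assumptions: the transformers package is installed and the repo-relative path below matches the checkout). The BLIP-specific keys such as encoder_width and add_cross_attention simply become extra attributes on the resulting config object.

from transformers import BertConfig

# Hypothetical path; adjust to wherever the repository is checked out.
config = BertConfig.from_json_file("finetune/blip/med_config.json")
print(config.hidden_size, config.encoder_width, config.add_cross_attention)  # 768 768 True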
finetune/blip/vit.py ADDED
@@ -0,0 +1,305 @@
1
+ '''
2
+ * Copyright (c) 2022, salesforce.com, inc.
3
+ * All rights reserved.
4
+ * SPDX-License-Identifier: BSD-3-Clause
5
+ * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ * By Junnan Li
7
+ * Based on timm code base
8
+ * https://github.com/rwightman/pytorch-image-models/tree/master/timm
9
+ '''
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+ from functools import partial
15
+
16
+ from timm.models.vision_transformer import _cfg, PatchEmbed
17
+ from timm.models.registry import register_model
18
+ from timm.models.layers import trunc_normal_, DropPath
19
+ from timm.models.helpers import named_apply, adapt_input_conv
20
+
21
+ from fairscale.nn.checkpoint.checkpoint_activations import checkpoint_wrapper
22
+
23
+ class Mlp(nn.Module):
24
+ """ MLP as used in Vision Transformer, MLP-Mixer and related networks
25
+ """
26
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
27
+ super().__init__()
28
+ out_features = out_features or in_features
29
+ hidden_features = hidden_features or in_features
30
+ self.fc1 = nn.Linear(in_features, hidden_features)
31
+ self.act = act_layer()
32
+ self.fc2 = nn.Linear(hidden_features, out_features)
33
+ self.drop = nn.Dropout(drop)
34
+
35
+ def forward(self, x):
36
+ x = self.fc1(x)
37
+ x = self.act(x)
38
+ x = self.drop(x)
39
+ x = self.fc2(x)
40
+ x = self.drop(x)
41
+ return x
42
+
43
+
44
+ class Attention(nn.Module):
45
+ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
46
+ super().__init__()
47
+ self.num_heads = num_heads
48
+ head_dim = dim // num_heads
49
+ # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
50
+ self.scale = qk_scale or head_dim ** -0.5
51
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
52
+ self.attn_drop = nn.Dropout(attn_drop)
53
+ self.proj = nn.Linear(dim, dim)
54
+ self.proj_drop = nn.Dropout(proj_drop)
55
+ self.attn_gradients = None
56
+ self.attention_map = None
57
+
58
+ def save_attn_gradients(self, attn_gradients):
59
+ self.attn_gradients = attn_gradients
60
+
61
+ def get_attn_gradients(self):
62
+ return self.attn_gradients
63
+
64
+ def save_attention_map(self, attention_map):
65
+ self.attention_map = attention_map
66
+
67
+ def get_attention_map(self):
68
+ return self.attention_map
69
+
70
+ def forward(self, x, register_hook=False):
71
+ B, N, C = x.shape
72
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
73
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
74
+
75
+ attn = (q @ k.transpose(-2, -1)) * self.scale
76
+ attn = attn.softmax(dim=-1)
77
+ attn = self.attn_drop(attn)
78
+
79
+ if register_hook:
80
+ self.save_attention_map(attn)
81
+ attn.register_hook(self.save_attn_gradients)
82
+
83
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
84
+ x = self.proj(x)
85
+ x = self.proj_drop(x)
86
+ return x
87
+
88
+
89
+ class Block(nn.Module):
90
+
91
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
92
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, use_grad_checkpointing=False):
93
+ super().__init__()
94
+ self.norm1 = norm_layer(dim)
95
+ self.attn = Attention(
96
+ dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
97
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
98
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
99
+ self.norm2 = norm_layer(dim)
100
+ mlp_hidden_dim = int(dim * mlp_ratio)
101
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
102
+
103
+ if use_grad_checkpointing:
104
+ self.attn = checkpoint_wrapper(self.attn)
105
+ self.mlp = checkpoint_wrapper(self.mlp)
106
+
107
+ def forward(self, x, register_hook=False):
108
+ x = x + self.drop_path(self.attn(self.norm1(x), register_hook=register_hook))
109
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
110
+ return x
111
+
112
+
113
+ class VisionTransformer(nn.Module):
114
+ """ Vision Transformer
115
+ A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` -
116
+ https://arxiv.org/abs/2010.11929
117
+ """
118
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
119
+ num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, representation_size=None,
120
+ drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=None,
121
+ use_grad_checkpointing=False, ckpt_layer=0):
122
+ """
123
+ Args:
124
+ img_size (int, tuple): input image size
125
+ patch_size (int, tuple): patch size
126
+ in_chans (int): number of input channels
127
+ num_classes (int): number of classes for classification head
128
+ embed_dim (int): embedding dimension
129
+ depth (int): depth of transformer
130
+ num_heads (int): number of attention heads
131
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
132
+ qkv_bias (bool): enable bias for qkv if True
133
+ qk_scale (float): override default qk scale of head_dim ** -0.5 if set
134
+ representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set
135
+ drop_rate (float): dropout rate
136
+ attn_drop_rate (float): attention dropout rate
137
+ drop_path_rate (float): stochastic depth rate
138
+ norm_layer: (nn.Module): normalization layer
139
+ """
140
+ super().__init__()
141
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
142
+ norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
143
+
144
+ self.patch_embed = PatchEmbed(
145
+ img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
146
+
147
+ num_patches = self.patch_embed.num_patches
148
+
149
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
150
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
151
+ self.pos_drop = nn.Dropout(p=drop_rate)
152
+
153
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
154
+ self.blocks = nn.ModuleList([
155
+ Block(
156
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
157
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
158
+ use_grad_checkpointing=(use_grad_checkpointing and i>=depth-ckpt_layer)
159
+ )
160
+ for i in range(depth)])
161
+ self.norm = norm_layer(embed_dim)
162
+
163
+ trunc_normal_(self.pos_embed, std=.02)
164
+ trunc_normal_(self.cls_token, std=.02)
165
+ self.apply(self._init_weights)
166
+
167
+ def _init_weights(self, m):
168
+ if isinstance(m, nn.Linear):
169
+ trunc_normal_(m.weight, std=.02)
170
+ if isinstance(m, nn.Linear) and m.bias is not None:
171
+ nn.init.constant_(m.bias, 0)
172
+ elif isinstance(m, nn.LayerNorm):
173
+ nn.init.constant_(m.bias, 0)
174
+ nn.init.constant_(m.weight, 1.0)
175
+
176
+ @torch.jit.ignore
177
+ def no_weight_decay(self):
178
+ return {'pos_embed', 'cls_token'}
179
+
180
+ def forward(self, x, register_blk=-1):
181
+ B = x.shape[0]
182
+ x = self.patch_embed(x)
183
+
184
+ cls_tokens = self.cls_token.expand(B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
185
+ x = torch.cat((cls_tokens, x), dim=1)
186
+
187
+ x = x + self.pos_embed[:,:x.size(1),:]
188
+ x = self.pos_drop(x)
189
+
190
+ for i,blk in enumerate(self.blocks):
191
+ x = blk(x, register_blk==i)
192
+ x = self.norm(x)
193
+
194
+ return x
195
+
196
+ @torch.jit.ignore()
197
+ def load_pretrained(self, checkpoint_path, prefix=''):
198
+ _load_weights(self, checkpoint_path, prefix)
199
+
200
+
201
+ @torch.no_grad()
202
+ def _load_weights(model: VisionTransformer, checkpoint_path: str, prefix: str = ''):
203
+ """ Load weights from .npz checkpoints for official Google Brain Flax implementation
204
+ """
205
+ import numpy as np
206
+
207
+ def _n2p(w, t=True):
208
+ if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1:
209
+ w = w.flatten()
210
+ if t:
211
+ if w.ndim == 4:
212
+ w = w.transpose([3, 2, 0, 1])
213
+ elif w.ndim == 3:
214
+ w = w.transpose([2, 0, 1])
215
+ elif w.ndim == 2:
216
+ w = w.transpose([1, 0])
217
+ return torch.from_numpy(w)
218
+
219
+ w = np.load(checkpoint_path)
220
+ if not prefix and 'opt/target/embedding/kernel' in w:
221
+ prefix = 'opt/target/'
222
+
223
+ if hasattr(model.patch_embed, 'backbone'):
224
+ # hybrid
225
+ backbone = model.patch_embed.backbone
226
+ stem_only = not hasattr(backbone, 'stem')
227
+ stem = backbone if stem_only else backbone.stem
228
+ stem.conv.weight.copy_(adapt_input_conv(stem.conv.weight.shape[1], _n2p(w[f'{prefix}conv_root/kernel'])))
229
+ stem.norm.weight.copy_(_n2p(w[f'{prefix}gn_root/scale']))
230
+ stem.norm.bias.copy_(_n2p(w[f'{prefix}gn_root/bias']))
231
+ if not stem_only:
232
+ for i, stage in enumerate(backbone.stages):
233
+ for j, block in enumerate(stage.blocks):
234
+ bp = f'{prefix}block{i + 1}/unit{j + 1}/'
235
+ for r in range(3):
236
+ getattr(block, f'conv{r + 1}').weight.copy_(_n2p(w[f'{bp}conv{r + 1}/kernel']))
237
+ getattr(block, f'norm{r + 1}').weight.copy_(_n2p(w[f'{bp}gn{r + 1}/scale']))
238
+ getattr(block, f'norm{r + 1}').bias.copy_(_n2p(w[f'{bp}gn{r + 1}/bias']))
239
+ if block.downsample is not None:
240
+ block.downsample.conv.weight.copy_(_n2p(w[f'{bp}conv_proj/kernel']))
241
+ block.downsample.norm.weight.copy_(_n2p(w[f'{bp}gn_proj/scale']))
242
+ block.downsample.norm.bias.copy_(_n2p(w[f'{bp}gn_proj/bias']))
243
+ embed_conv_w = _n2p(w[f'{prefix}embedding/kernel'])
244
+ else:
245
+ embed_conv_w = adapt_input_conv(
246
+ model.patch_embed.proj.weight.shape[1], _n2p(w[f'{prefix}embedding/kernel']))
247
+ model.patch_embed.proj.weight.copy_(embed_conv_w)
248
+ model.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias']))
249
+ model.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False))
250
+ pos_embed_w = _n2p(w[f'{prefix}Transformer/posembed_input/pos_embedding'], t=False)
251
+ if pos_embed_w.shape != model.pos_embed.shape:
252
+ pos_embed_w = resize_pos_embed( # resize pos embedding when different size from pretrained weights
253
+ pos_embed_w, model.pos_embed, getattr(model, 'num_tokens', 1), model.patch_embed.grid_size)
254
+ model.pos_embed.copy_(pos_embed_w)
255
+ model.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale']))
256
+ model.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias']))
257
+ # if isinstance(model.head, nn.Linear) and model.head.bias.shape[0] == w[f'{prefix}head/bias'].shape[-1]:
258
+ # model.head.weight.copy_(_n2p(w[f'{prefix}head/kernel']))
259
+ # model.head.bias.copy_(_n2p(w[f'{prefix}head/bias']))
260
+ # if isinstance(getattr(model.pre_logits, 'fc', None), nn.Linear) and f'{prefix}pre_logits/bias' in w:
261
+ # model.pre_logits.fc.weight.copy_(_n2p(w[f'{prefix}pre_logits/kernel']))
262
+ # model.pre_logits.fc.bias.copy_(_n2p(w[f'{prefix}pre_logits/bias']))
263
+ for i, block in enumerate(model.blocks.children()):
264
+ block_prefix = f'{prefix}Transformer/encoderblock_{i}/'
265
+ mha_prefix = block_prefix + 'MultiHeadDotProductAttention_1/'
266
+ block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale']))
267
+ block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias']))
268
+ block.attn.qkv.weight.copy_(torch.cat([
269
+ _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T for n in ('query', 'key', 'value')]))
270
+ block.attn.qkv.bias.copy_(torch.cat([
271
+ _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) for n in ('query', 'key', 'value')]))
272
+ block.attn.proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel']).flatten(1))
273
+ block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias']))
274
+ for r in range(2):
275
+ getattr(block.mlp, f'fc{r + 1}').weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/kernel']))
276
+ getattr(block.mlp, f'fc{r + 1}').bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/bias']))
277
+ block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/scale']))
278
+ block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/bias']))
279
+
280
+
281
+ def interpolate_pos_embed(pos_embed_checkpoint, visual_encoder):
282
+ # interpolate position embedding
283
+ embedding_size = pos_embed_checkpoint.shape[-1]
284
+ num_patches = visual_encoder.patch_embed.num_patches
285
+ num_extra_tokens = visual_encoder.pos_embed.shape[-2] - num_patches
286
+ # height (== width) for the checkpoint position embedding
287
+ orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
288
+ # height (== width) for the new position embedding
289
+ new_size = int(num_patches ** 0.5)
290
+
291
+ if orig_size!=new_size:
292
+ # class_token and dist_token are kept unchanged
293
+ extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
294
+ # only the position tokens are interpolated
295
+ pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
296
+ pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
297
+ pos_tokens = torch.nn.functional.interpolate(
298
+ pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
299
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
300
+ new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
301
+ print('reshape position embedding from %d to %d'%(orig_size ** 2,new_size ** 2))
302
+
303
+ return new_pos_embed
304
+ else:
305
+ return pos_embed_checkpoint
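A minimal construction sketch for the VisionTransformer above (assumptions: timm and fairscale are installed, since they are imported at module load, and the import path is illustrative of a checkout where finetune/blip is importable):

import torch
from finetune.blip.vit import VisionTransformer

# ViT-Base-style dimensions matching the constructor defaults above.
vit = VisionTransformer(img_size=224, patch_size=16, embed_dim=768, depth=12, num_heads=12)
with torch.no_grad():
    feats = vit(torch.randn(1, 3, 224, 224))
print(feats.shape)  # (1, 197, 768): one CLS token plus 14x14 patch tokens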
finetune/clean_captions_and_tags.py ADDED
@@ -0,0 +1,190 @@
1
+ # このスクリプトのライセンスは、Apache License 2.0とします
2
+ # (c) 2022 Kohya S. @kohya_ss
3
+
4
+ import argparse
5
+ import glob
6
+ import os
7
+ import json
8
+ import re
9
+
10
+ from tqdm import tqdm
11
+
12
+ PATTERN_HAIR_LENGTH = re.compile(r', (long|short|medium) hair, ')
13
+ PATTERN_HAIR_CUT = re.compile(r', (bob|hime) cut, ')
14
+ PATTERN_HAIR = re.compile(r', ([\w\-]+) hair, ')
15
+ PATTERN_WORD = re.compile(r', ([\w\-]+|hair ornament), ')
16
+
17
+ # 複数人がいるとき、複数の髪色や目の色が定義されていれば削除する
18
+ PATTERNS_REMOVE_IN_MULTI = [
19
+ PATTERN_HAIR_LENGTH,
20
+ PATTERN_HAIR_CUT,
21
+ re.compile(r', [\w\-]+ eyes, '),
22
+ re.compile(r', ([\w\-]+ sleeves|sleeveless), '),
23
+ # 複数の髪型定義がある場合は削除する
24
+ re.compile(
25
+ r', (ponytail|braid|ahoge|twintails|[\w\-]+ bun|single hair bun|single side bun|two side up|two tails|[\w\-]+ braid|sidelocks), '),
26
+ ]
27
+
28
+
29
+ def clean_tags(image_key, tags):
30
+ # replace '_' to ' '
31
+ tags = tags.replace('^_^', '^@@@^')
32
+ tags = tags.replace('_', ' ')
33
+ tags = tags.replace('^@@@^', '^_^')
34
+
35
+ # remove rating: deepdanbooruのみ
36
+ tokens = tags.split(", rating")
37
+ if len(tokens) == 1:
38
+ # WD14 taggerのときはこちらになるのでメッセージは出さない
39
+ # print("no rating:")
40
+ # print(f"{image_key} {tags}")
41
+ pass
42
+ else:
43
+ if len(tokens) > 2:
44
+ print("multiple ratings:")
45
+ print(f"{image_key} {tags}")
46
+ tags = tokens[0]
47
+
48
+ tags = ", " + tags.replace(", ", ", , ") + ", " # カンマ付きで検索をするための身も蓋もない対策
49
+
50
+ # 複数の人物がいる場合は髪色等のタグを削除する
51
+ if 'girls' in tags or 'boys' in tags:
52
+ for pat in PATTERNS_REMOVE_IN_MULTI:
53
+ found = pat.findall(tags)
54
+ if len(found) > 1: # 二つ以上、タグがある
55
+ tags = pat.sub("", tags)
56
+
57
+ # 髪の特殊対応
58
+ srch_hair_len = PATTERN_HAIR_LENGTH.search(tags) # 髪の長さタグは例外なので避けておく(全員が同じ髪の長さの場合)
59
+ if srch_hair_len:
60
+ org = srch_hair_len.group()
61
+ tags = PATTERN_HAIR_LENGTH.sub(", @@@, ", tags)
62
+
63
+ found = PATTERN_HAIR.findall(tags)
64
+ if len(found) > 1:
65
+ tags = PATTERN_HAIR.sub("", tags)
66
+
67
+ if srch_hair_len:
68
+ tags = tags.replace(", @@@, ", org) # 戻す
69
+
70
+ # white shirtとshirtみたいな重複タグの削除
71
+ found = PATTERN_WORD.findall(tags)
72
+ for word in found:
73
+ if re.search(f", ((\w+) )+{word}, ", tags):
74
+ tags = tags.replace(f", {word}, ", "")
75
+
76
+ tags = tags.replace(", , ", ", ")
77
+ assert tags.startswith(", ") and tags.endswith(", ")
78
+ tags = tags[2:-2]
79
+ return tags
80
+
81
+
82
+ # 上から順に検索、置換される
83
+ # ('置換元文字列', '置換後文字列')
84
+ CAPTION_REPLACEMENTS = [
85
+ ('anime anime', 'anime'),
86
+ ('young ', ''),
87
+ ('anime girl', 'girl'),
88
+ ('cartoon female', 'girl'),
89
+ ('cartoon lady', 'girl'),
90
+ ('cartoon character', 'girl'), # a or ~s
91
+ ('cartoon woman', 'girl'),
92
+ ('cartoon women', 'girls'),
93
+ ('cartoon girl', 'girl'),
94
+ ('anime female', 'girl'),
95
+ ('anime lady', 'girl'),
96
+ ('anime character', 'girl'), # a or ~s
97
+ ('anime woman', 'girl'),
98
+ ('anime women', 'girls'),
99
+ ('lady', 'girl'),
100
+ ('female', 'girl'),
101
+ ('woman', 'girl'),
102
+ ('women', 'girls'),
103
+ ('people', 'girls'),
104
+ ('person', 'girl'),
105
+ ('a cartoon figure', 'a figure'),
106
+ ('a cartoon image', 'an image'),
107
+ ('a cartoon picture', 'a picture'),
108
+ ('an anime cartoon image', 'an image'),
109
+ ('a cartoon anime drawing', 'a drawing'),
110
+ ('a cartoon drawing', 'a drawing'),
111
+ ('girl girl', 'girl'),
112
+ ]
113
+
114
+
115
+ def clean_caption(caption):
116
+ for rf, rt in CAPTION_REPLACEMENTS:
117
+ replaced = True
118
+ while replaced:
119
+ bef = caption
120
+ caption = caption.replace(rf, rt)
121
+ replaced = bef != caption
122
+ return caption
123
+
124
+
125
+ def main(args):
126
+ if os.path.exists(args.in_json):
127
+ print(f"loading existing metadata: {args.in_json}")
128
+ with open(args.in_json, "rt", encoding='utf-8') as f:
129
+ metadata = json.load(f)
130
+ else:
131
+ print("no metadata / メタデータファイルがありません")
132
+ return
133
+
134
+ print("cleaning captions and tags.")
135
+ image_keys = list(metadata.keys())
136
+ for image_key in tqdm(image_keys):
137
+ tags = metadata[image_key].get('tags')
138
+ if tags is None:
139
+ print(f"image does not have tags / メタデータにタグがありません: {image_key}")
140
+ else:
141
+ org = tags
142
+ tags = clean_tags(image_key, tags)
143
+ metadata[image_key]['tags'] = tags
144
+ if args.debug and org != tags:
145
+ print("FROM: " + org)
146
+ print("TO: " + tags)
147
+
148
+ caption = metadata[image_key].get('caption')
149
+ if caption is None:
150
+ print(f"image does not have caption / メタデータにキャプションがありません: {image_key}")
151
+ else:
152
+ org = caption
153
+ caption = clean_caption(caption)
154
+ metadata[image_key]['caption'] = caption
155
+ if args.debug and org != caption:
156
+ print("FROM: " + org)
157
+ print("TO: " + caption)
158
+
159
+ # metadataを書き出して終わり
160
+ print(f"writing metadata: {args.out_json}")
161
+ with open(args.out_json, "wt", encoding='utf-8') as f:
162
+ json.dump(metadata, f, indent=2)
163
+ print("done!")
164
+
165
+
166
+ def setup_parser() -> argparse.ArgumentParser:
167
+ parser = argparse.ArgumentParser()
168
+ # parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ")
169
+ parser.add_argument("in_json", type=str, help="metadata file to input / 読み込むメタデータファイル")
170
+ parser.add_argument("out_json", type=str, help="metadata file to output / メタデータファイル書き出し先")
171
+ parser.add_argument("--debug", action="store_true", help="debug mode")
172
+
173
+ return parser
174
+
175
+
176
+ if __name__ == '__main__':
177
+ parser = setup_parser()
178
+
179
+ args, unknown = parser.parse_known_args()
180
+ if len(unknown) == 1:
181
+ print("WARNING: train_data_dir argument is removed. This script will not work with three arguments in the future. Please specify two arguments: in_json and out_json.")
182
+ print("All captions and tags in the metadata are processed.")
183
+ print("警告: train_data_dir引数は不要になりました。将来的には三つの引数を指定すると動かなくなる予定です。読み込み元のメタデータと書き出し先の二つの引数だけ指定してください。")
184
+ print("メタデータ内のすべてのキャプションとタグが処理されます。")
185
+ args.in_json = args.out_json
186
+ args.out_json = unknown[0]
187
+ elif len(unknown) > 0:
188
+ raise ValueError(f"error: unrecognized arguments: {unknown}")
189
+
190
+ main(args)
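The two helpers above can also be exercised directly; a minimal sketch (the import path and the sample strings are illustrative only):

from finetune.clean_captions_and_tags import clean_caption, clean_tags

# Caption replacements are applied repeatedly until the text stops changing.
print(clean_caption("a cartoon drawing of an anime woman with long hair"))

# Tag cleaning replaces underscores and drops redundant tags such as "shirt" next to "white shirt".
print(clean_tags("img001", "1girl, long_hair, blue_eyes, white shirt, shirt"))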
finetune/hypernetwork_nai.py ADDED
@@ -0,0 +1,96 @@
1
+ # NAI compatible
2
+
3
+ import torch
4
+
5
+
6
+ class HypernetworkModule(torch.nn.Module):
7
+ def __init__(self, dim, multiplier=1.0):
8
+ super().__init__()
9
+
10
+ linear1 = torch.nn.Linear(dim, dim * 2)
11
+ linear2 = torch.nn.Linear(dim * 2, dim)
12
+ linear1.weight.data.normal_(mean=0.0, std=0.01)
13
+ linear1.bias.data.zero_()
14
+ linear2.weight.data.normal_(mean=0.0, std=0.01)
15
+ linear2.bias.data.zero_()
16
+ linears = [linear1, linear2]
17
+
18
+ self.linear = torch.nn.Sequential(*linears)
19
+ self.multiplier = multiplier
20
+
21
+ def forward(self, x):
22
+ return x + self.linear(x) * self.multiplier
23
+
24
+
25
+ class Hypernetwork(torch.nn.Module):
26
+ enable_sizes = [320, 640, 768, 1280]
27
+ # return self.modules[Hypernetwork.enable_sizes.index(size)]
28
+
29
+ def __init__(self, multiplier=1.0) -> None:
30
+ super().__init__()
31
+ self.modules = []
32
+ for size in Hypernetwork.enable_sizes:
33
+ self.modules.append((HypernetworkModule(size, multiplier), HypernetworkModule(size, multiplier)))
34
+ self.register_module(f"{size}_0", self.modules[-1][0])
35
+ self.register_module(f"{size}_1", self.modules[-1][1])
36
+
37
+ def apply_to_stable_diffusion(self, text_encoder, vae, unet):
38
+ blocks = unet.input_blocks + [unet.middle_block] + unet.output_blocks
39
+ for block in blocks:
40
+ for subblk in block:
41
+ if 'SpatialTransformer' in str(type(subblk)):
42
+ for tf_block in subblk.transformer_blocks:
43
+ for attn in [tf_block.attn1, tf_block.attn2]:
44
+ size = attn.context_dim
45
+ if size in Hypernetwork.enable_sizes:
46
+ attn.hypernetwork = self
47
+ else:
48
+ attn.hypernetwork = None
49
+
50
+ def apply_to_diffusers(self, text_encoder, vae, unet):
51
+ blocks = unet.down_blocks + [unet.mid_block] + unet.up_blocks
52
+ for block in blocks:
53
+ if hasattr(block, 'attentions'):
54
+ for subblk in block.attentions:
55
+ if 'SpatialTransformer' in str(type(subblk)) or 'Transformer2DModel' in str(type(subblk)): # 0.6.0 and 0.7~
56
+ for tf_block in subblk.transformer_blocks:
57
+ for attn in [tf_block.attn1, tf_block.attn2]:
58
+ size = attn.to_k.in_features
59
+ if size in Hypernetwork.enable_sizes:
60
+ attn.hypernetwork = self
61
+ else:
62
+ attn.hypernetwork = None
63
+ return True # TODO error checking
64
+
65
+ def forward(self, x, context):
66
+ size = context.shape[-1]
67
+ assert size in Hypernetwork.enable_sizes
68
+ module = self.modules[Hypernetwork.enable_sizes.index(size)]
69
+ return module[0].forward(context), module[1].forward(context)
70
+
71
+ def load_from_state_dict(self, state_dict):
72
+ # old ver to new ver
73
+ changes = {
74
+ 'linear1.bias': 'linear.0.bias',
75
+ 'linear1.weight': 'linear.0.weight',
76
+ 'linear2.bias': 'linear.1.bias',
77
+ 'linear2.weight': 'linear.1.weight',
78
+ }
79
+ for key_from, key_to in changes.items():
80
+ if key_from in state_dict:
81
+ state_dict[key_to] = state_dict[key_from]
82
+ del state_dict[key_from]
83
+
84
+ for size, sd in state_dict.items():
85
+ if type(size) == int:
86
+ self.modules[Hypernetwork.enable_sizes.index(size)][0].load_state_dict(sd[0], strict=True)
87
+ self.modules[Hypernetwork.enable_sizes.index(size)][1].load_state_dict(sd[1], strict=True)
88
+ return True
89
+
90
+ def get_state_dict(self):
91
+ state_dict = {}
92
+ for i, size in enumerate(Hypernetwork.enable_sizes):
93
+ sd0 = self.modules[i][0].state_dict()
94
+ sd1 = self.modules[i][1].state_dict()
95
+ state_dict[size] = [sd0, sd1]
96
+ return state_dict
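A minimal loading sketch for the NAI-compatible hypernetwork above (assumptions: the import path is illustrative, and "hypernet.pt" is a hypothetical checkpoint whose state dict uses the integer-keyed layout that load_from_state_dict expects):

import torch
from finetune.hypernetwork_nai import Hypernetwork

hyper = Hypernetwork(multiplier=1.0)
state_dict = torch.load("hypernet.pt", map_location="cpu")
hyper.load_from_state_dict(state_dict)

# forward() returns replacement key/value contexts for a cross-attention layer;
# the context dimension must be one of Hypernetwork.enable_sizes.
context = torch.zeros(1, 77, 768)
k_ctx, v_ctx = hyper(context, context)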
finetune/make_captions.py ADDED
@@ -0,0 +1,168 @@
1
+ import argparse
2
+ import glob
3
+ import os
4
+ import json
5
+ import random
6
+
7
+ from PIL import Image
8
+ from tqdm import tqdm
9
+ import numpy as np
10
+ import torch
11
+ from torchvision import transforms
12
+ from torchvision.transforms.functional import InterpolationMode
13
+ from blip.blip import blip_decoder
14
+ import library.train_util as train_util
15
+
16
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
17
+
18
+
19
+ IMAGE_SIZE = 384
20
+
21
+ # 正方形でいいのか? という気がするがソースがそうなので
22
+ IMAGE_TRANSFORM = transforms.Compose([
23
+ transforms.Resize((IMAGE_SIZE, IMAGE_SIZE), interpolation=InterpolationMode.BICUBIC),
24
+ transforms.ToTensor(),
25
+ transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
26
+ ])
27
+
28
+ # 共通化したいが微妙に処理が異なる……
29
+ class ImageLoadingTransformDataset(torch.utils.data.Dataset):
30
+ def __init__(self, image_paths):
31
+ self.images = image_paths
32
+
33
+ def __len__(self):
34
+ return len(self.images)
35
+
36
+ def __getitem__(self, idx):
37
+ img_path = self.images[idx]
38
+
39
+ try:
40
+ image = Image.open(img_path).convert("RGB")
41
+ # convert to tensor temporarily so dataloader will accept it
42
+ tensor = IMAGE_TRANSFORM(image)
43
+ except Exception as e:
44
+ print(f"Could not load image path / 画像を読み込めません: {img_path}, error: {e}")
45
+ return None
46
+
47
+ return (tensor, img_path)
48
+
49
+
50
+ def collate_fn_remove_corrupted(batch):
51
+ """Collate function that allows to remove corrupted examples in the
52
+ dataloader. It expects that the dataloader returns 'None' when that occurs.
53
+ The 'None's in the batch are removed.
54
+ """
55
+ # Filter out all the Nones (corrupted examples)
56
+ batch = list(filter(lambda x: x is not None, batch))
57
+ return batch
58
+
59
+
60
+ def main(args):
61
+ # fix the seed for reproducibility
62
+ seed = args.seed # + utils.get_rank()
63
+ torch.manual_seed(seed)
64
+ np.random.seed(seed)
65
+ random.seed(seed)
66
+
67
+ if not os.path.exists("blip"):
68
+ args.train_data_dir = os.path.abspath(args.train_data_dir) # convert to absolute path
69
+
70
+ cwd = os.getcwd()
71
+ print('Current Working Directory is: ', cwd)
72
+ os.chdir('finetune')
73
+
74
+ print(f"load images from {args.train_data_dir}")
75
+ image_paths = train_util.glob_images(args.train_data_dir)
76
+ print(f"found {len(image_paths)} images.")
77
+
78
+ print(f"loading BLIP caption: {args.caption_weights}")
79
+ model = blip_decoder(pretrained=args.caption_weights, image_size=IMAGE_SIZE, vit='large', med_config="./blip/med_config.json")
80
+ model.eval()
81
+ model = model.to(DEVICE)
82
+ print("BLIP loaded")
83
+
84
+ # captioningする
85
+ def run_batch(path_imgs):
86
+ imgs = torch.stack([im for _, im in path_imgs]).to(DEVICE)
87
+
88
+ with torch.no_grad():
89
+ if args.beam_search:
90
+ captions = model.generate(imgs, sample=False, num_beams=args.num_beams,
91
+ max_length=args.max_length, min_length=args.min_length)
92
+ else:
93
+ captions = model.generate(imgs, sample=True, top_p=args.top_p, max_length=args.max_length, min_length=args.min_length)
94
+
95
+ for (image_path, _), caption in zip(path_imgs, captions):
96
+ with open(os.path.splitext(image_path)[0] + args.caption_extension, "wt", encoding='utf-8') as f:
97
+ f.write(caption + "\n")
98
+ if args.debug:
99
+ print(image_path, caption)
100
+
101
+ # 読み込みの高速化のためにDataLoaderを使うオプション
102
+ if args.max_data_loader_n_workers is not None:
103
+ dataset = ImageLoadingTransformDataset(image_paths)
104
+ data = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False,
105
+ num_workers=args.max_data_loader_n_workers, collate_fn=collate_fn_remove_corrupted, drop_last=False)
106
+ else:
107
+ data = [[(None, ip)] for ip in image_paths]
108
+
109
+ b_imgs = []
110
+ for data_entry in tqdm(data, smoothing=0.0):
111
+ for data in data_entry:
112
+ if data is None:
113
+ continue
114
+
115
+ img_tensor, image_path = data
116
+ if img_tensor is None:
117
+ try:
118
+ raw_image = Image.open(image_path)
119
+ if raw_image.mode != 'RGB':
120
+ raw_image = raw_image.convert("RGB")
121
+ img_tensor = IMAGE_TRANSFORM(raw_image)
122
+ except Exception as e:
123
+ print(f"Could not load image path / 画像を読み込めません: {image_path}, error: {e}")
124
+ continue
125
+
126
+ b_imgs.append((image_path, img_tensor))
127
+ if len(b_imgs) >= args.batch_size:
128
+ run_batch(b_imgs)
129
+ b_imgs.clear()
130
+ if len(b_imgs) > 0:
131
+ run_batch(b_imgs)
132
+
133
+ print("done!")
134
+
135
+
136
+ def setup_parser() -> argparse.ArgumentParser:
137
+ parser = argparse.ArgumentParser()
138
+ parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ")
139
+ parser.add_argument("--caption_weights", type=str, default="https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth",
140
+ help="BLIP caption weights (model_large_caption.pth) / BLIP captionの重みファイル(model_large_caption.pth)")
141
+ parser.add_argument("--caption_extention", type=str, default=None,
142
+ help="extension of caption file (for backward compatibility) / 出力されるキャプションファイルの拡張子(スペルミスしていたのを残してあります)")
143
+ parser.add_argument("--caption_extension", type=str, default=".caption", help="extension of caption file / 出力されるキャプションファイルの拡張子")
144
+ parser.add_argument("--beam_search", action="store_true",
145
+ help="use beam search (default Nucleus sampling) / beam searchを使う(このオプション未指定時はNucleus sampling)")
146
+ parser.add_argument("--batch_size", type=int, default=1, help="batch size in inference / 推論時のバッチサイズ")
147
+ parser.add_argument("--max_data_loader_n_workers", type=int, default=None,
148
+ help="enable image reading by DataLoader with this number of workers (faster) / DataLoaderによる画像読み込みを有効にしてこのワーカー数を適用する(読み込みを高速化)")
149
+ parser.add_argument("--num_beams", type=int, default=1, help="num of beams in beam search /beam search時のビーム数(多いと精度が上がるが時間がかかる)")
150
+ parser.add_argument("--top_p", type=float, default=0.9, help="top_p in Nucleus sampling / Nucleus sampling時のtop_p")
151
+ parser.add_argument("--max_length", type=int, default=75, help="max length of caption / captionの最大長")
152
+ parser.add_argument("--min_length", type=int, default=5, help="min length of caption / captionの最小長")
153
+ parser.add_argument('--seed', default=42, type=int, help='seed for reproducibility / 再現性を確保するための乱数seed')
154
+ parser.add_argument("--debug", action="store_true", help="debug mode")
155
+
156
+ return parser
157
+
158
+
159
+ if __name__ == '__main__':
160
+ parser = setup_parser()
161
+
162
+ args = parser.parse_args()
163
+
164
+ # スペルミスしていたオプションを復元する
165
+ if args.caption_extention is not None:
166
+ args.caption_extension = args.caption_extention
167
+
168
+ main(args)
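A minimal invocation sketch (run from the repository root; "./train_images" is a placeholder directory, and the BLIP weights are fetched from the default --caption_weights URL on first use):

import subprocess

subprocess.run(
    ["python", "finetune/make_captions.py", "./train_images",
     "--batch_size", "4", "--beam_search", "--caption_extension", ".caption"],
    check=True,
)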
finetune/make_captions_by_git.py ADDED
@@ -0,0 +1,151 @@
1
+ import argparse
2
+ import os
3
+ import re
4
+
5
+ from PIL import Image
6
+ from tqdm import tqdm
7
+ import torch
8
+ from transformers import AutoProcessor, AutoModelForCausalLM
9
+ from transformers.generation.utils import GenerationMixin
10
+
11
+ import library.train_util as train_util
12
+
13
+
14
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
15
+
16
+ PATTERN_REPLACE = [
17
+ re.compile(r'(has|with|and) the (words?|letters?|name) (" ?[^"]*"|\w+)( ?(is )?(on|in) (the |her |their |him )?\w+)?'),
18
+ re.compile(r'(with a sign )?that says ?(" ?[^"]*"|\w+)( ?on it)?'),
19
+ re.compile(r"(with a sign )?that says ?(' ?(i'm)?[^']*'|\w+)( ?on it)?"),
20
+ re.compile(r'with the number \d+ on (it|\w+ \w+)'),
21
+ re.compile(r'with the words "'),
22
+ re.compile(r'word \w+ on it'),
23
+ re.compile(r'that says the word \w+ on it'),
24
+ re.compile('that says\'the word "( on it)?'),
25
+ ]
26
+
27
+ # 誤検知しまくりの with the word xxxx を消す
28
+
29
+
30
+ def remove_words(captions, debug):
31
+ removed_caps = []
32
+ for caption in captions:
33
+ cap = caption
34
+ for pat in PATTERN_REPLACE:
35
+ cap = pat.sub("", cap)
36
+ if debug and cap != caption:
37
+ print(caption)
38
+ print(cap)
39
+ removed_caps.append(cap)
40
+ return removed_caps
41
+
42
+
43
+ def collate_fn_remove_corrupted(batch):
44
+ """Collate function that allows to remove corrupted examples in the
45
+ dataloader. It expects that the dataloader returns 'None' when that occurs.
46
+ The 'None's in the batch are removed.
47
+ """
48
+ # Filter out all the Nones (corrupted examples)
49
+ batch = list(filter(lambda x: x is not None, batch))
50
+ return batch
51
+
52
+
53
+ def main(args):
54
+ # GITにバッチサイズが1より大きくても動くようにパッチを当てる: transformers 4.26.0用
55
+ org_prepare_input_ids_for_generation = GenerationMixin._prepare_input_ids_for_generation
56
+ curr_batch_size = [args.batch_size] # ループの最後で件数がbatch_size未満になるので入れ替えられるように
57
+
58
+ # input_idsがバッチサイズと同じ件数である必要がある:バッチサイズはこの関数から参照できないので外から渡す
59
+ # ここより上で置き換えようとするとすごく大変
60
+ def _prepare_input_ids_for_generation_patch(self, bos_token_id, encoder_outputs):
61
+ input_ids = org_prepare_input_ids_for_generation(self, bos_token_id, encoder_outputs)
62
+ if input_ids.size()[0] != curr_batch_size[0]:
63
+ input_ids = input_ids.repeat(curr_batch_size[0], 1)
64
+ return input_ids
65
+ GenerationMixin._prepare_input_ids_for_generation = _prepare_input_ids_for_generation_patch
66
+
67
+ print(f"load images from {args.train_data_dir}")
68
+ image_paths = train_util.glob_images(args.train_data_dir)
69
+ print(f"found {len(image_paths)} images.")
70
+
71
+ # できればcacheに依存せず明示的にダウンロードしたい
72
+ print(f"loading GIT: {args.model_id}")
73
+ git_processor = AutoProcessor.from_pretrained(args.model_id)
74
+ git_model = AutoModelForCausalLM.from_pretrained(args.model_id).to(DEVICE)
75
+ print("GIT loaded")
76
+
77
+ # captioningする
78
+ def run_batch(path_imgs):
79
+ imgs = [im for _, im in path_imgs]
80
+
81
+ curr_batch_size[0] = len(path_imgs)
82
+ inputs = git_processor(images=imgs, return_tensors="pt").to(DEVICE) # 画像はpil形式
83
+ generated_ids = git_model.generate(pixel_values=inputs.pixel_values, max_length=args.max_length)
84
+ captions = git_processor.batch_decode(generated_ids, skip_special_tokens=True)
85
+
86
+ if args.remove_words:
87
+ captions = remove_words(captions, args.debug)
88
+
89
+ for (image_path, _), caption in zip(path_imgs, captions):
90
+ with open(os.path.splitext(image_path)[0] + args.caption_extension, "wt", encoding='utf-8') as f:
91
+ f.write(caption + "\n")
92
+ if args.debug:
93
+ print(image_path, caption)
94
+
95
+ # 読み込みの高速化のためにDataLoaderを使うオプション
96
+ if args.max_data_loader_n_workers is not None:
97
+ dataset = train_util.ImageLoadingDataset(image_paths)
98
+ data = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False,
99
+ num_workers=args.max_data_loader_n_workers, collate_fn=collate_fn_remove_corrupted, drop_last=False)
100
+ else:
101
+ data = [[(None, ip)] for ip in image_paths]
102
+
103
+ b_imgs = []
104
+ for data_entry in tqdm(data, smoothing=0.0):
105
+ for data in data_entry:
106
+ if data is None:
107
+ continue
108
+
109
+ image, image_path = data
110
+ if image is None:
111
+ try:
112
+ image = Image.open(image_path)
113
+ if image.mode != 'RGB':
114
+ image = image.convert("RGB")
115
+ except Exception as e:
116
+ print(f"Could not load image path / 画像を読み込めません: {image_path}, error: {e}")
117
+ continue
118
+
119
+ b_imgs.append((image_path, image))
120
+ if len(b_imgs) >= args.batch_size:
121
+ run_batch(b_imgs)
122
+ b_imgs.clear()
123
+
124
+ if len(b_imgs) > 0:
125
+ run_batch(b_imgs)
126
+
127
+ print("done!")
128
+
129
+
130
+ def setup_parser() -> argparse.ArgumentParser:
131
+ parser = argparse.ArgumentParser()
132
+ parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ")
133
+ parser.add_argument("--caption_extension", type=str, default=".caption", help="extension of caption file / 出力されるキャプションファイルの拡張子")
134
+ parser.add_argument("--model_id", type=str, default="microsoft/git-large-textcaps",
135
+ help="model id for GIT in Hugging Face / 使用するGITのHugging FaceのモデルID")
136
+ parser.add_argument("--batch_size", type=int, default=1, help="batch size in inference / 推論時のバッチサイズ")
137
+ parser.add_argument("--max_data_loader_n_workers", type=int, default=None,
138
+ help="enable image reading by DataLoader with this number of workers (faster) / DataLoaderによる画像読み込みを有効にしてこのワーカー数を適用する(読み込みを高速化)")
139
+ parser.add_argument("--max_length", type=int, default=50, help="max length of caption / captionの最大長")
140
+ parser.add_argument("--remove_words", action="store_true",
141
+ help="remove like `with the words xxx` from caption / `with the words xxx`のような部分をキャプションから削除する")
142
+ parser.add_argument("--debug", action="store_true", help="debug mode")
143
+
144
+ return parser
145
+
146
+
147
+ if __name__ == '__main__':
148
+ parser = setup_parser()
149
+
150
+ args = parser.parse_args()
151
+ main(args)
finetune/merge_captions_to_metadata.py ADDED
@@ -0,0 +1,76 @@
1
+ import argparse
2
+ import json
3
+ from pathlib import Path
4
+ from typing import List
5
+ from tqdm import tqdm
6
+ import library.train_util as train_util
7
+ import os
8
+
9
+ def main(args):
10
+ assert not args.recursive or (args.recursive and args.full_path), "recursive requires full_path / recursiveはfull_pathと同時に指定してください"
11
+
12
+ train_data_dir_path = Path(args.train_data_dir)
13
+ image_paths: List[Path] = train_util.glob_images_pathlib(train_data_dir_path, args.recursive)
14
+ print(f"found {len(image_paths)} images.")
15
+
16
+ if args.in_json is None and Path(args.out_json).is_file():
17
+ args.in_json = args.out_json
18
+
19
+ if args.in_json is not None:
20
+ print(f"loading existing metadata: {args.in_json}")
21
+ metadata = json.loads(Path(args.in_json).read_text(encoding='utf-8'))
22
+ print("captions for existing images will be overwritten / 既存の画像のキャプションは上書きされます")
23
+ else:
24
+ print("new metadata will be created / 新しいメタデータファイルが作成されます")
25
+ metadata = {}
26
+
27
+ print("merge caption texts to metadata json.")
28
+ for image_path in tqdm(image_paths):
29
+ caption_path = image_path.with_suffix(args.caption_extension)
30
+ if not os.path.exists(caption_path):
31
+     caption_path = os.path.join(image_path, args.caption_extension)
32
+
33
+ caption = caption_path.read_text(encoding='utf-8').strip()
34
+
35
+ image_key = str(image_path) if args.full_path else image_path.stem
36
+ if image_key not in metadata:
37
+ metadata[image_key] = {}
38
+
39
+ metadata[image_key]['caption'] = caption
40
+ if args.debug:
41
+ print(image_key, caption)
42
+
43
+ # metadataを書き出して終わり
44
+ print(f"writing metadata: {args.out_json}")
45
+ Path(args.out_json).write_text(json.dumps(metadata, indent=2), encoding='utf-8')
46
+ print("done!")
47
+
48
+
49
+ def setup_parser() -> argparse.ArgumentParser:
50
+ parser = argparse.ArgumentParser()
51
+ parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ")
52
+ parser.add_argument("out_json", type=str, help="metadata file to output / メタデータファイル書き出し先")
53
+ parser.add_argument("--in_json", type=str,
54
+ help="metadata file to input (if omitted and out_json exists, existing out_json is read) / 読み込むメタデータファイル(省略時、out_jsonが存在すればそれを読み込む)")
55
+ parser.add_argument("--caption_extention", type=str, default=None,
56
+ help="extension of caption file (for backward compatibility) / 読み込むキャプションファイルの拡張子(スペルミスしていたのを残してあります)")
57
+ parser.add_argument("--caption_extension", type=str, default=".caption", help="extension of caption file / 読み込むキャプションファイルの拡張子")
58
+ parser.add_argument("--full_path", action="store_true",
59
+ help="use full path as image-key in metadata (supports multiple directories) / メタデータで画像キーをフルパスにする(複数の学習画像ディレクトリに対応)")
60
+ parser.add_argument("--recursive", action="store_true",
61
+ help="recursively look for training tags in all child folders of train_data_dir / train_data_dirのすべての子フォルダにある学習タグを再帰的に探す")
62
+ parser.add_argument("--debug", action="store_true", help="debug mode")
63
+
64
+ return parser
65
+
66
+
67
+ if __name__ == '__main__':
68
+ parser = setup_parser()
69
+
70
+ args = parser.parse_args()
71
+
72
+ # スペルミスしていたオプションを復元する
73
+ if args.caption_extention is not None:
74
+ args.caption_extension = args.caption_extention
75
+
76
+ main(args)
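For reference, the metadata this script writes has the following shape (illustrative values; keys are image stems, or full paths when --full_path is given):

metadata_example = {
    "img_0001": {"caption": "a girl standing in a field"},
    "img_0002": {"caption": "a drawing of a girl with long hair"},
}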
finetune/merge_dd_tags_to_metadata.py ADDED
@@ -0,0 +1,71 @@
1
+ import argparse
2
+ import json
3
+ from pathlib import Path
4
+ from typing import List
5
+ from tqdm import tqdm
6
+ import library.train_util as train_util
7
+ import os
8
+
9
+ def main(args):
10
+ assert not args.recursive or (args.recursive and args.full_path), "recursive requires full_path / recursiveはfull_pathと同時に指定してください"
11
+
12
+ train_data_dir_path = Path(args.train_data_dir)
13
+ image_paths: List[Path] = train_util.glob_images_pathlib(train_data_dir_path, args.recursive)
14
+ print(f"found {len(image_paths)} images.")
15
+
16
+ if args.in_json is None and Path(args.out_json).is_file():
17
+ args.in_json = args.out_json
18
+
19
+ if args.in_json is not None:
20
+ print(f"loading existing metadata: {args.in_json}")
21
+ metadata = json.loads(Path(args.in_json).read_text(encoding='utf-8'))
22
+ print("tags data for existing images will be overwritten / 既存の画像のタグは上書きされます")
23
+ else:
24
+ print("new metadata will be created / 新しいメタデータファイルが作成されます")
25
+ metadata = {}
26
+
27
+ print("merge tags to metadata json.")
28
+ for image_path in tqdm(image_paths):
29
+ tags_path = image_path.with_suffix(args.caption_extension)
30
+ if not os.path.exists(tags_path):
31
+     tags_path = os.path.join(image_path, args.caption_extension)
32
+
33
+ tags = tags_path.read_text(encoding='utf-8').strip()
34
+
35
+ image_key = str(image_path) if args.full_path else image_path.stem
36
+ if image_key not in metadata:
37
+ metadata[image_key] = {}
38
+
39
+ metadata[image_key]['tags'] = tags
40
+ if args.debug:
41
+ print(image_key, tags)
42
+
43
+ # metadataを書き出して終わり
44
+ print(f"writing metadata: {args.out_json}")
45
+ Path(args.out_json).write_text(json.dumps(metadata, indent=2), encoding='utf-8')
46
+
47
+ print("done!")
48
+
49
+
50
+ def setup_parser() -> argparse.ArgumentParser:
51
+ parser = argparse.ArgumentParser()
52
+ parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ")
53
+ parser.add_argument("out_json", type=str, help="metadata file to output / メタデータファイル書き出し先")
54
+ parser.add_argument("--in_json", type=str,
55
+ help="metadata file to input (if omitted and out_json exists, existing out_json is read) / 読み込むメタデータファイル(省略時、out_jsonが存在すればそれを読み込む)")
56
+ parser.add_argument("--full_path", action="store_true",
57
+ help="use full path as image-key in metadata (supports multiple directories) / メタデータで画像キーをフルパスにする(複数の学習画像ディレクトリに対応)")
58
+ parser.add_argument("--recursive", action="store_true",
59
+ help="recursively look for training tags in all child folders of train_data_dir / train_data_dirのすべての子フォルダにある学習タグを再帰的に探す")
60
+ parser.add_argument("--caption_extension", type=str, default=".txt",
61
+ help="extension of caption (tag) file / 読み込むキャプション(タグ)ファイルの拡張子")
62
+ parser.add_argument("--debug", action="store_true", help="debug mode, print tags")
63
+
64
+ return parser
65
+
66
+
67
+ if __name__ == '__main__':
68
+ parser = setup_parser()
69
+
70
+ args = parser.parse_args()
71
+ main(args)
finetune/prepare_buckets_latents.py ADDED
@@ -0,0 +1,267 @@
1
+ import argparse
2
+ import os
3
+ import json
4
+
5
+ from tqdm import tqdm
6
+ import numpy as np
7
+ from PIL import Image
8
+ import cv2
9
+ import torch
10
+ from torchvision import transforms
11
+
12
+ import library.model_util as model_util
13
+ import library.train_util as train_util
14
+
15
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
16
+
17
+ IMAGE_TRANSFORMS = transforms.Compose(
18
+ [
19
+ transforms.ToTensor(),
20
+ transforms.Normalize([0.5], [0.5]),
21
+ ]
22
+ )
23
+
24
+
25
+ def collate_fn_remove_corrupted(batch):
26
+ """Collate function that allows to remove corrupted examples in the
27
+ dataloader. It expects that the dataloader returns 'None' when that occurs.
28
+ The 'None's in the batch are removed.
29
+ """
30
+ # Filter out all the Nones (corrupted examples)
31
+ batch = list(filter(lambda x: x is not None, batch))
32
+ return batch
33
+
34
+
35
+ def get_latents(vae, images, weight_dtype):
36
+ img_tensors = [IMAGE_TRANSFORMS(image) for image in images]
37
+ img_tensors = torch.stack(img_tensors)
38
+ img_tensors = img_tensors.to(DEVICE, weight_dtype)
39
+ with torch.no_grad():
40
+ latents = vae.encode(img_tensors).latent_dist.sample().float().to("cpu").numpy()
41
+ return latents
42
+
43
+
44
+ def get_npz_filename_wo_ext(data_dir, image_key, is_full_path, flip):
45
+ if is_full_path:
46
+ base_name = os.path.splitext(os.path.basename(image_key))[0]
47
+ else:
48
+ base_name = image_key
49
+ if flip:
50
+ base_name += '_flip'
51
+ return os.path.join(data_dir, base_name)
52
+
53
+
54
+ def main(args):
55
+ # assert args.bucket_reso_steps % 8 == 0, f"bucket_reso_steps must be divisible by 8 / bucket_reso_stepは8で割り切れる必要があります"
56
+ if args.bucket_reso_steps % 8 > 0:
57
+ print(f"resolution of buckets in training time is a multiple of 8 / 学習時の各bucketの解像度は8単位になります")
58
+
59
+ image_paths = train_util.glob_images(args.train_data_dir)
60
+ print(f"found {len(image_paths)} images.")
61
+
62
+ if os.path.exists(args.in_json):
63
+ print(f"loading existing metadata: {args.in_json}")
64
+ with open(args.in_json, "rt", encoding='utf-8') as f:
65
+ metadata = json.load(f)
66
+ else:
67
+ print(f"no metadata / メタデータファイルがありません: {args.in_json}")
68
+ return
69
+
70
+ weight_dtype = torch.float32
71
+ if args.mixed_precision == "fp16":
72
+ weight_dtype = torch.float16
73
+ elif args.mixed_precision == "bf16":
74
+ weight_dtype = torch.bfloat16
75
+
76
+ vae = model_util.load_vae(args.model_name_or_path, weight_dtype)
77
+ vae.eval()
78
+ vae.to(DEVICE, dtype=weight_dtype)
79
+
80
+ # bucketのサイズを計算する
81
+ max_reso = tuple([int(t) for t in args.max_resolution.split(',')])
82
+ assert len(max_reso) == 2, f"illegal resolution (not 'width,height') / 画像サイズに誤りがあります。'幅,高さ'で指定してください: {args.max_resolution}"
83
+
84
+ bucket_manager = train_util.BucketManager(args.bucket_no_upscale, max_reso,
85
+ args.min_bucket_reso, args.max_bucket_reso, args.bucket_reso_steps)
86
+ if not args.bucket_no_upscale:
87
+ bucket_manager.make_buckets()
88
+ else:
89
+ print("min_bucket_reso and max_bucket_reso are ignored if bucket_no_upscale is set, because bucket reso is defined by image size automatically / bucket_no_upscaleが指定された場合は、bucketの解像度は画像サイズから自動計算されるため、min_bucket_resoとmax_bucket_resoは無視されます")
90
+
91
+ # 画像をひとつずつ適切なbucketに割り当てながらlatentを計算する
92
+ img_ar_errors = []
93
+
94
+ def process_batch(is_last):
95
+ for bucket in bucket_manager.buckets:
96
+ if (is_last and len(bucket) > 0) or len(bucket) >= args.batch_size:
97
+ latents = get_latents(vae, [img for _, img in bucket], weight_dtype)
98
+ assert latents.shape[2] == bucket[0][1].shape[0] // 8 and latents.shape[3] == bucket[0][1].shape[1] // 8, \
99
+ f"latent shape {latents.shape}, {bucket[0][1].shape}"
100
+
101
+ for (image_key, _), latent in zip(bucket, latents):
102
+ npz_file_name = get_npz_filename_wo_ext(args.train_data_dir, image_key, args.full_path, False)
103
+ np.savez(npz_file_name, latent)
104
+
105
+ # flip
106
+ if args.flip_aug:
107
+ latents = get_latents(vae, [img[:, ::-1].copy() for _, img in bucket], weight_dtype) # copyがないとTensor変換できない
108
+
109
+ for (image_key, _), latent in zip(bucket, latents):
110
+ npz_file_name = get_npz_filename_wo_ext(args.train_data_dir, image_key, args.full_path, True)
111
+ np.savez(npz_file_name, latent)
112
+ else:
113
+ # remove existing flipped npz
114
+ for image_key, _ in bucket:
115
+ npz_file_name = get_npz_filename_wo_ext(args.train_data_dir, image_key, args.full_path, True) + ".npz"
116
+ if os.path.isfile(npz_file_name):
117
+ print(f"remove existing flipped npz / 既存のflipされたnpzファイルを削除します: {npz_file_name}")
118
+ os.remove(npz_file_name)
119
+
120
+ bucket.clear()
121
+
122
+ # 読み込みの高速化のためにDataLoaderを使うオプション
123
+ if args.max_data_loader_n_workers is not None:
124
+ dataset = train_util.ImageLoadingDataset(image_paths)
125
+ data = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False,
126
+ num_workers=args.max_data_loader_n_workers, collate_fn=collate_fn_remove_corrupted, drop_last=False)
127
+ else:
128
+ data = [[(None, ip)] for ip in image_paths]
129
+
130
+ bucket_counts = {}
131
+ for data_entry in tqdm(data, smoothing=0.0):
132
+ if data_entry[0] is None:
133
+ continue
134
+
135
+ img_tensor, image_path = data_entry[0]
136
+ if img_tensor is not None:
137
+ image = transforms.functional.to_pil_image(img_tensor)
138
+ else:
139
+ try:
140
+ image = Image.open(image_path)
141
+ if image.mode != 'RGB':
142
+ image = image.convert("RGB")
143
+ except Exception as e:
144
+ print(f"Could not load image path / 画像を読み込めません: {image_path}, error: {e}")
145
+ continue
146
+
147
+ image_key = image_path if args.full_path else os.path.splitext(os.path.basename(image_path))[0]
148
+ if image_key not in metadata:
149
+ metadata[image_key] = {}
150
+
151
+ # 本当はこのあとの部分もDataSetに持っていけば高速化できるがいろいろ大変
152
+
153
+ reso, resized_size, ar_error = bucket_manager.select_bucket(image.width, image.height)
154
+ img_ar_errors.append(abs(ar_error))
155
+ bucket_counts[reso] = bucket_counts.get(reso, 0) + 1
156
+
157
+ # メタデータに記録する解像度はlatent単位とするので、8単位で切り捨て
158
+ metadata[image_key]['train_resolution'] = (reso[0] - reso[0] % 8, reso[1] - reso[1] % 8)
159
+
160
+ if not args.bucket_no_upscale:
161
+ # upscaleを行わないときには、resize後のサイズは、bucketのサイズと、縦横どちらかが同じであることを確認する
162
+ assert resized_size[0] == reso[0] or resized_size[1] == reso[
163
+ 1], f"internal error, resized size not match: {reso}, {resized_size}, {image.width}, {image.height}"
164
+ assert resized_size[0] >= reso[0] and resized_size[1] >= reso[
165
+ 1], f"internal error, resized size too small: {reso}, {resized_size}, {image.width}, {image.height}"
166
+
167
+ assert resized_size[0] >= reso[0] and resized_size[1] >= reso[
168
+ 1], f"internal error resized size is small: {resized_size}, {reso}"
169
+
170
+ # 既に存在するファイルがあればshapeを確認して同じならskipする
171
+ if args.skip_existing:
172
+ npz_files = [get_npz_filename_wo_ext(args.train_data_dir, image_key, args.full_path, False) + ".npz"]
173
+ if args.flip_aug:
174
+ npz_files.append(get_npz_filename_wo_ext(args.train_data_dir, image_key, args.full_path, True) + ".npz")
175
+
176
+ found = True
177
+ for npz_file in npz_files:
178
+ if not os.path.exists(npz_file):
179
+ found = False
180
+ break
181
+
182
+ dat = np.load(npz_file)['arr_0']
183
+ if dat.shape[1] != reso[1] // 8 or dat.shape[2] != reso[0] // 8: # latentsのshapeを確認
184
+ found = False
185
+ break
186
+ if found:
187
+ continue
188
+
189
+ # 画像をリサイズしてトリミングする
190
+ # PILにinter_areaがないのでcv2で……
191
+ image = np.array(image)
192
+ if resized_size[0] != image.shape[1] or resized_size[1] != image.shape[0]: # リサイズ処理が必要?
193
+ image = cv2.resize(image, resized_size, interpolation=cv2.INTER_AREA)
194
+
195
+ if resized_size[0] > reso[0]:
196
+ trim_size = resized_size[0] - reso[0]
197
+ image = image[:, trim_size//2:trim_size//2 + reso[0]]
198
+
199
+ if resized_size[1] > reso[1]:
200
+ trim_size = resized_size[1] - reso[1]
201
+ image = image[trim_size//2:trim_size//2 + reso[1]]
202
+
203
+ assert image.shape[0] == reso[1] and image.shape[1] == reso[0], f"internal error, illegal trimmed size: {image.shape}, {reso}"
204
+
205
+ # # debug
206
+ # cv2.imwrite(f"r:\\test\\img_{len(img_ar_errors)}.jpg", image[:, :, ::-1])
207
+
208
+ # add to the batch
209
+ bucket_manager.add_image(reso, (image_key, image))
210
+
211
+ # run inference on the batch when it is full
212
+ process_batch(False)
213
+
214
+ # process the remainder
215
+ process_batch(True)
216
+
217
+ bucket_manager.sort()
218
+ for i, reso in enumerate(bucket_manager.resos):
219
+ count = bucket_counts.get(reso, 0)
220
+ if count > 0:
221
+ print(f"bucket {i} {reso}: {count}")
222
+ img_ar_errors = np.array(img_ar_errors)
223
+ print(f"mean ar error: {np.mean(img_ar_errors)}")
224
+
225
+ # write out the metadata and finish
226
+ print(f"writing metadata: {args.out_json}")
227
+ with open(args.out_json, "wt", encoding='utf-8') as f:
228
+ json.dump(metadata, f, indent=2)
229
+ print("done!")
230
+
231
+
232
+ def setup_parser() -> argparse.ArgumentParser:
233
+ parser = argparse.ArgumentParser()
234
+ parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ")
235
+ parser.add_argument("in_json", type=str, help="metadata file to input / 読み込むメタデータファイル")
236
+ parser.add_argument("out_json", type=str, help="metadata file to output / メタデータファイル書き出し先")
237
+ parser.add_argument("model_name_or_path", type=str, help="model name or path to encode latents / latentを取得するためのモデル")
238
+ parser.add_argument("--v2", action='store_true',
239
+ help='not used (for backward compatibility) / 使用されません(互換性のため残してあります)')
240
+ parser.add_argument("--batch_size", type=int, default=1, help="batch size in inference / 推論時のバッチサイズ")
241
+ parser.add_argument("--max_data_loader_n_workers", type=int, default=None,
242
+ help="enable image reading by DataLoader with this number of workers (faster) / DataLoaderによる画像読み込みを有効にしてこのワーカー数を適用する(読み込みを高速化)")
243
+ parser.add_argument("--max_resolution", type=str, default="512,512",
244
+ help="max resolution in fine tuning (width,height) / fine tuning時の最大画像サイズ 「幅,高さ」(使用メモリ量に関係します)")
245
+ parser.add_argument("--min_bucket_reso", type=int, default=256, help="minimum resolution for buckets / bucketの最小解像度")
246
+ parser.add_argument("--max_bucket_reso", type=int, default=1024, help="maximum resolution for buckets / bucketの最小解像度")
247
+ parser.add_argument("--bucket_reso_steps", type=int, default=64,
248
+ help="steps of resolution for buckets, divisible by 8 is recommended / bucketの解像度の単位、8で割り切れる値を推奨します")
249
+ parser.add_argument("--bucket_no_upscale", action="store_true",
250
+ help="make bucket for each image without upscaling / 画像を拡大せずbucketを作成します")
251
+ parser.add_argument("--mixed_precision", type=str, default="no",
252
+ choices=["no", "fp16", "bf16"], help="use mixed precision / 混合精度を使う場合、その精度")
253
+ parser.add_argument("--full_path", action="store_true",
254
+ help="use full path as image-key in metadata (supports multiple directories) / メタデータで画像キーをフルパスにする(複数の学習画像ディレクトリに対応)")
255
+ parser.add_argument("--flip_aug", action="store_true",
256
+ help="flip augmentation, save latents for flipped images / 左右反転した画像もlatentを取得、保存する")
257
+ parser.add_argument("--skip_existing", action="store_true",
258
+ help="skip images if npz already exists (both normal and flipped exists if flip_aug is enabled) / npzが既に存在する画像をスキップする(flip_aug有効時は通常、反転の両方が存在する画像をスキップ)")
259
+
260
+ return parser
261
+
262
+
263
+ if __name__ == '__main__':
264
+ parser = setup_parser()
265
+
266
+ args = parser.parse_args()
267
+ main(args)
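
For reference, a minimal sketch (not part of this commit) of how the cached latents written by this script line up with the metadata it emits. It assumes the default layout in which, without --full_path, each non-flipped latent is saved as <train_data_dir>/<image_key>.npz; the directory and file names below are placeholders.

import json
import os

import numpy as np

TRAIN_DATA_DIR = "train_images"   # placeholder path
OUT_JSON = "meta_lat.json"        # placeholder metadata file

with open(OUT_JSON, "r", encoding="utf-8") as f:
    metadata = json.load(f)

for image_key, entry in metadata.items():
    width, height = entry["train_resolution"]   # already truncated to multiples of 8
    npz_path = os.path.join(TRAIN_DATA_DIR, image_key) + ".npz"   # assumed layout
    if not os.path.isfile(npz_path):
        continue
    latent = np.load(npz_path)["arr_0"]
    # np.savez stores the latent under 'arr_0'; its spatial size is 1/8 of the pixel size
    assert latent.shape[1] == height // 8 and latent.shape[2] == width // 8
print("cached latents match the recorded train_resolution")
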
finetune/tag_images_by_wd14_tagger.py ADDED
@@ -0,0 +1,206 @@
1
+ import argparse
2
+ import csv
3
+ import glob
4
+ import os
5
+
6
+ from PIL import Image
7
+ import cv2
8
+ from tqdm import tqdm
9
+ import numpy as np
10
+ from tensorflow.keras.models import load_model
11
+ from huggingface_hub import hf_hub_download
12
+ import torch
13
+
14
+ import library.train_util as train_util
15
+
16
+ # from wd14 tagger
17
+ IMAGE_SIZE = 448
18
+
19
+ # wd-v1-4-swinv2-tagger-v2 / wd-v1-4-vit-tagger / wd-v1-4-vit-tagger-v2/ wd-v1-4-convnext-tagger / wd-v1-4-convnext-tagger-v2
20
+ DEFAULT_WD14_TAGGER_REPO = 'SmilingWolf/wd-v1-4-convnext-tagger-v2'
21
+ FILES = ["keras_metadata.pb", "saved_model.pb", "selected_tags.csv"]
22
+ SUB_DIR = "variables"
23
+ SUB_DIR_FILES = ["variables.data-00000-of-00001", "variables.index"]
24
+ CSV_FILE = FILES[-1]
25
+
26
+
27
+ def preprocess_image(image):
28
+ image = np.array(image)
29
+ image = image[:, :, ::-1] # RGB->BGR
30
+
31
+ # pad to square
32
+ size = max(image.shape[0:2])
33
+ pad_x = size - image.shape[1]
34
+ pad_y = size - image.shape[0]
35
+ pad_l = pad_x // 2
36
+ pad_t = pad_y // 2
37
+ image = np.pad(image, ((pad_t, pad_y - pad_t), (pad_l, pad_x - pad_l), (0, 0)), mode='constant', constant_values=255)
38
+
39
+ interp = cv2.INTER_AREA if size > IMAGE_SIZE else cv2.INTER_LANCZOS4
40
+ image = cv2.resize(image, (IMAGE_SIZE, IMAGE_SIZE), interpolation=interp)
41
+
42
+ image = image.astype(np.float32)
43
+ return image
44
+
45
+
46
+ class ImageLoadingPrepDataset(torch.utils.data.Dataset):
47
+ def __init__(self, image_paths):
48
+ self.images = image_paths
49
+
50
+ def __len__(self):
51
+ return len(self.images)
52
+
53
+ def __getitem__(self, idx):
54
+ img_path = self.images[idx]
55
+
56
+ try:
57
+ image = Image.open(img_path).convert("RGB")
58
+ image = preprocess_image(image)
59
+ tensor = torch.tensor(image)
60
+ except Exception as e:
61
+ print(f"Could not load image path / 画像を読み込めません: {img_path}, error: {e}")
62
+ return None
63
+
64
+ return (tensor, img_path)
65
+
66
+
67
+ def collate_fn_remove_corrupted(batch):
68
+ """Collate function that allows to remove corrupted examples in the
69
+ dataloader. It expects that the dataloader returns 'None' when that occurs.
70
+ The 'None's in the batch are removed.
71
+ """
72
+ # Filter out all the Nones (corrupted examples)
73
+ batch = list(filter(lambda x: x is not None, batch))
74
+ return batch
75
+
76
+
77
+ def main(args):
78
+ # using hf_hub_download directly reportedly causes symlink issues, so work around it by specifying a cache directory and force_filename
79
+ # this emits a deprecation warning; deal with it if and when the option is removed
80
+ # https://github.com/toriato/stable-diffusion-webui-wd14-tagger/issues/22
81
+ if not os.path.exists(args.model_dir) or args.force_download:
82
+ print(f"downloading wd14 tagger model from hf_hub. id: {args.repo_id}")
83
+ for file in FILES:
84
+ hf_hub_download(args.repo_id, file, cache_dir=args.model_dir, force_download=True, force_filename=file)
85
+ for file in SUB_DIR_FILES:
86
+ hf_hub_download(args.repo_id, file, subfolder=SUB_DIR, cache_dir=os.path.join(
87
+ args.model_dir, SUB_DIR), force_download=True, force_filename=file)
88
+ else:
89
+ print("using existing wd14 tagger model")
90
+
91
+ # load images
92
+ image_paths = train_util.glob_images(args.train_data_dir)
93
+ print(f"found {len(image_paths)} images.")
94
+
95
+ print("loading model and labels")
96
+ model = load_model(args.model_dir)
97
+
98
+ # label_names = pd.read_csv("2022_0000_0899_6549/selected_tags.csv")
99
+ # read the CSV manually to avoid adding another dependency
100
+ with open(os.path.join(args.model_dir, CSV_FILE), "r", encoding="utf-8") as f:
101
+ reader = csv.reader(f)
102
+ l = [row for row in reader]
103
+ header = l[0] # tag_id,name,category,count
104
+ rows = l[1:]
105
+ assert header[0] == 'tag_id' and header[1] == 'name' and header[2] == 'category', f"unexpected csv format: {header}"
106
+
107
+ tags = [row[1] for row in rows[1:] if row[2] == '0'] # category 0, i.e. only general tags
108
+
109
+ # run inference
110
+ def run_batch(path_imgs):
111
+ imgs = np.array([im for _, im in path_imgs])
112
+
113
+ probs = model(imgs, training=False)
114
+ probs = probs.numpy()
115
+
116
+ for (image_path, _), prob in zip(path_imgs, probs):
117
+ # the first 4 outputs are ratings, so skip them
118
+ # # First 4 labels are actually ratings: pick one with argmax
119
+ # ratings_names = label_names[:4]
120
+ # rating_index = ratings_names["probs"].argmax()
121
+ # found_rating = ratings_names[rating_index: rating_index + 1][["name", "probs"]]
122
+
123
+ # the rest are tags; add those whose confidence exceeds the threshold
124
+ # Everything else is tags: pick any where prediction confidence > threshold
125
+ tag_text = ""
126
+ for i, p in enumerate(prob[4:]): # numpy would be faster, but the tag count is small enough for a plain loop
127
+ if p >= args.thresh and i < len(tags):
128
+ tag_text += ", " + tags[i]
129
+
130
+ if len(tag_text) > 0:
131
+ tag_text = tag_text[2:] # strip the leading ", "
132
+
133
+ with open(os.path.splitext(image_path)[0] + args.caption_extension, "wt", encoding='utf-8') as f:
134
+ f.write(tag_text + '\n')
135
+ if args.debug:
136
+ print(image_path, tag_text)
137
+
138
+ # option to use a DataLoader to speed up image loading
139
+ if args.max_data_loader_n_workers is not None:
140
+ dataset = ImageLoadingPrepDataset(image_paths)
141
+ data = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False,
142
+ num_workers=args.max_data_loader_n_workers, collate_fn=collate_fn_remove_corrupted, drop_last=False)
143
+ else:
144
+ data = [[(None, ip)] for ip in image_paths]
145
+
146
+ b_imgs = []
147
+ for data_entry in tqdm(data, smoothing=0.0):
148
+ for data in data_entry:
149
+ if data is None:
150
+ continue
151
+
152
+ image, image_path = data
153
+ if image is not None:
154
+ image = image.detach().numpy()
155
+ else:
156
+ try:
157
+ image = Image.open(image_path)
158
+ if image.mode != 'RGB':
159
+ image = image.convert("RGB")
160
+ image = preprocess_image(image)
161
+ except Exception as e:
162
+ print(f"Could not load image path / 画像を読み込めません: {image_path}, error: {e}")
163
+ continue
164
+ b_imgs.append((image_path, image))
165
+
166
+ if len(b_imgs) >= args.batch_size:
167
+ run_batch(b_imgs)
168
+ b_imgs.clear()
169
+
170
+ if len(b_imgs) > 0:
171
+ run_batch(b_imgs)
172
+
173
+ print("done!")
174
+
175
+
176
+ def setup_parser() -> argparse.ArgumentParser:
177
+ parser = argparse.ArgumentParser()
178
+ parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ")
179
+ parser.add_argument("--repo_id", type=str, default=DEFAULT_WD14_TAGGER_REPO,
180
+ help="repo id for wd14 tagger on Hugging Face / Hugging Faceのwd14 taggerのリポジトリID")
181
+ parser.add_argument("--model_dir", type=str, default="wd14_tagger_model",
182
+ help="directory to store wd14 tagger model / wd14 taggerのモデルを格納するディレクトリ")
183
+ parser.add_argument("--force_download", action='store_true',
184
+ help="force downloading wd14 tagger models / wd14 taggerのモデルを再ダウンロードします")
185
+ parser.add_argument("--thresh", type=float, default=0.35, help="threshold of confidence to add a tag / タグを追加するか判定する閾値")
186
+ parser.add_argument("--batch_size", type=int, default=1, help="batch size in inference / 推論時のバッチサイズ")
187
+ parser.add_argument("--max_data_loader_n_workers", type=int, default=None,
188
+ help="enable image reading by DataLoader with this number of workers (faster) / DataLoaderによる画像読み込みを有効にしてこのワーカー数を適用する(読み込みを高速化)")
189
+ parser.add_argument("--caption_extention", type=str, default=None,
190
+ help="extension of caption file (for backward compatibility) / 出力されるキャプションファイルの拡張子(スペルミスしていたのを残してあります)")
191
+ parser.add_argument("--caption_extension", type=str, default=".txt", help="extension of caption file / 出力されるキャプションファイルの拡張子")
192
+ parser.add_argument("--debug", action="store_true", help="debug mode")
193
+
194
+ return parser
195
+
196
+
197
+ if __name__ == '__main__':
198
+ parser = setup_parser()
199
+
200
+ args = parser.parse_args()
201
+
202
+ # carry over the value from the misspelled legacy option
203
+ if args.caption_extention is not None:
204
+ args.caption_extension = args.caption_extention
205
+
206
+ main(args)
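
As a quick illustration (not part of this commit), the tag-selection step inside run_batch() can be exercised on its own: the first four model outputs are rating classes and are skipped, and each remaining tag whose confidence reaches the threshold (default 0.35) is joined into the caption with ", ". The tag list and probabilities below are made up.

import numpy as np

def probs_to_caption(prob, tags, thresh=0.35):
    # mirrors run_batch(): skip the 4 rating outputs, keep tags at or above the threshold
    selected = [tags[i] for i, p in enumerate(prob[4:]) if p >= thresh and i < len(tags)]
    return ", ".join(selected)

tags = ["1girl", "solo", "smile"]                # illustrative tag list
prob = np.array([0.9, 0.05, 0.03, 0.02,          # rating outputs (ignored)
                 0.80, 0.20, 0.55])              # per-tag confidences
print(probs_to_caption(prob, tags))              # -> 1girl, smile
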
finetune_gui.py ADDED
@@ -0,0 +1,888 @@
1
+ import gradio as gr
2
+ import json
3
+ import math
4
+ import os
5
+ import subprocess
6
+ import pathlib
7
+ import argparse
8
+ from library.common_gui import (
9
+ get_folder_path,
10
+ get_file_path,
11
+ get_saveasfile_path,
12
+ save_inference_file,
13
+ gradio_advanced_training,
14
+ run_cmd_advanced_training,
15
+ gradio_training,
17
+ gradio_config,
18
+ gradio_source_model,
19
+ color_aug_changed,
20
+ run_cmd_training,
21
+ # set_legacy_8bitadam,
22
+ update_my_data,
23
+ check_if_model_exist,
24
+ )
25
+ from library.tensorboard_gui import (
26
+ gradio_tensorboard,
27
+ start_tensorboard,
28
+ stop_tensorboard,
29
+ )
30
+ from library.utilities import utilities_tab
31
+ from library.sampler_gui import sample_gradio_config, run_cmd_sample
32
+
33
+ folder_symbol = '\U0001f4c2' # 📂
34
+ refresh_symbol = '\U0001f504' # 🔄
35
+ save_style_symbol = '\U0001f4be' # 💾
36
+ document_symbol = '\U0001F4C4' # 📄
37
+
38
+ PYTHON = 'python3' if os.name == 'posix' else './venv/Scripts/python.exe'
39
+
40
+
41
+ def save_configuration(
42
+ save_as,
43
+ file_path,
44
+ pretrained_model_name_or_path,
45
+ v2,
46
+ v_parameterization,
47
+ train_dir,
48
+ image_folder,
49
+ output_dir,
50
+ logging_dir,
51
+ max_resolution,
52
+ min_bucket_reso,
53
+ max_bucket_reso,
54
+ batch_size,
55
+ flip_aug,
56
+ caption_metadata_filename,
57
+ latent_metadata_filename,
58
+ full_path,
59
+ learning_rate,
60
+ lr_scheduler,
61
+ lr_warmup,
62
+ dataset_repeats,
63
+ train_batch_size,
64
+ epoch,
65
+ save_every_n_epochs,
66
+ mixed_precision,
67
+ save_precision,
68
+ seed,
69
+ num_cpu_threads_per_process,
70
+ train_text_encoder,
71
+ create_caption,
72
+ create_buckets,
73
+ save_model_as,
74
+ caption_extension,
75
+ # use_8bit_adam,
76
+ xformers,
77
+ clip_skip,
78
+ save_state,
79
+ resume,
80
+ gradient_checkpointing,
81
+ gradient_accumulation_steps,
82
+ mem_eff_attn,
83
+ shuffle_caption,
84
+ output_name,
85
+ max_token_length,
86
+ max_train_epochs,
87
+ max_data_loader_n_workers,
88
+ full_fp16,
89
+ color_aug,
90
+ model_list,
91
+ cache_latents,
92
+ use_latent_files,
93
+ keep_tokens,
94
+ persistent_data_loader_workers,
95
+ bucket_no_upscale,
96
+ random_crop,
97
+ bucket_reso_steps,
98
+ caption_dropout_every_n_epochs,
99
+ caption_dropout_rate,
100
+ optimizer,
101
+ optimizer_args,
102
+ noise_offset,
103
+ sample_every_n_steps,
104
+ sample_every_n_epochs,
105
+ sample_sampler,
106
+ sample_prompts,
107
+ additional_parameters,
108
+ vae_batch_size,
109
+ min_snr_gamma,
110
+ ):
111
+ # Get list of function parameters and values
112
+ parameters = list(locals().items())
113
+
114
+ original_file_path = file_path
115
+
116
+ save_as_bool = True if save_as.get('label') == 'True' else False
117
+
118
+ if save_as_bool:
119
+ print('Save as...')
120
+ file_path = get_saveasfile_path(file_path)
121
+ else:
122
+ print('Save...')
123
+ if file_path == None or file_path == '':
124
+ file_path = get_saveasfile_path(file_path)
125
+
126
+ # print(file_path)
127
+
128
+ if file_path == None or file_path == '':
129
+ return original_file_path # In case a file_path was provided and the user decide to cancel the open action
130
+
131
+ # Return the values of the variables as a dictionary
132
+ variables = {
133
+ name: value
134
+ for name, value in parameters # locals().items()
135
+ if name
136
+ not in [
137
+ 'file_path',
138
+ 'save_as',
139
+ ]
140
+ }
141
+
142
+ # Extract the destination directory from the file path
143
+ destination_directory = os.path.dirname(file_path)
144
+
145
+ # Create the destination directory if it doesn't exist
146
+ if not os.path.exists(destination_directory):
147
+ os.makedirs(destination_directory)
148
+
149
+ # Save the data to the selected file
150
+ with open(file_path, 'w') as file:
151
+ json.dump(variables, file, indent=2)
152
+
153
+ return file_path
154
+
155
+
156
+ def open_configuration(
157
+ ask_for_file,
158
+ file_path,
159
+ pretrained_model_name_or_path,
160
+ v2,
161
+ v_parameterization,
162
+ train_dir,
163
+ image_folder,
164
+ output_dir,
165
+ logging_dir,
166
+ max_resolution,
167
+ min_bucket_reso,
168
+ max_bucket_reso,
169
+ batch_size,
170
+ flip_aug,
171
+ caption_metadata_filename,
172
+ latent_metadata_filename,
173
+ full_path,
174
+ learning_rate,
175
+ lr_scheduler,
176
+ lr_warmup,
177
+ dataset_repeats,
178
+ train_batch_size,
179
+ epoch,
180
+ save_every_n_epochs,
181
+ mixed_precision,
182
+ save_precision,
183
+ seed,
184
+ num_cpu_threads_per_process,
185
+ train_text_encoder,
186
+ create_caption,
187
+ create_buckets,
188
+ save_model_as,
189
+ caption_extension,
190
+ # use_8bit_adam,
191
+ xformers,
192
+ clip_skip,
193
+ save_state,
194
+ resume,
195
+ gradient_checkpointing,
196
+ gradient_accumulation_steps,
197
+ mem_eff_attn,
198
+ shuffle_caption,
199
+ output_name,
200
+ max_token_length,
201
+ max_train_epochs,
202
+ max_data_loader_n_workers,
203
+ full_fp16,
204
+ color_aug,
205
+ model_list,
206
+ cache_latents,
207
+ use_latent_files,
208
+ keep_tokens,
209
+ persistent_data_loader_workers,
210
+ bucket_no_upscale,
211
+ random_crop,
212
+ bucket_reso_steps,
213
+ caption_dropout_every_n_epochs,
214
+ caption_dropout_rate,
215
+ optimizer,
216
+ optimizer_args,
217
+ noise_offset,
218
+ sample_every_n_steps,
219
+ sample_every_n_epochs,
220
+ sample_sampler,
221
+ sample_prompts,
222
+ additional_parameters,
223
+ vae_batch_size,
224
+ min_snr_gamma,
225
+ ):
226
+ # Get list of function parameters and values
227
+ parameters = list(locals().items())
228
+
229
+ ask_for_file = True if ask_for_file.get('label') == 'True' else False
230
+
231
+ original_file_path = file_path
232
+
233
+ if ask_for_file:
234
+ file_path = get_file_path(file_path)
235
+
236
+ if not file_path == '' and not file_path == None:
237
+ # load variables from JSON file
238
+ with open(file_path, 'r') as f:
239
+ my_data = json.load(f)
240
+ print('Loading config...')
241
+ # Update values to fix deprecated use_8bit_adam checkbox and set appropriate optimizer if it is set to True
242
+ my_data = update_my_data(my_data)
243
+ else:
244
+ file_path = original_file_path # In case a file_path was provided and the user decide to cancel the open action
245
+ my_data = {}
246
+
247
+ values = [file_path]
248
+ for key, value in parameters:
249
+ # Set the value in the dictionary to the corresponding value in `my_data`, or the default value if not found
250
+ if not key in ['ask_for_file', 'file_path']:
251
+ values.append(my_data.get(key, value))
252
+ return tuple(values)
253
+
254
+
255
+ def train_model(
256
+ pretrained_model_name_or_path,
257
+ v2,
258
+ v_parameterization,
259
+ train_dir,
260
+ image_folder,
261
+ output_dir,
262
+ logging_dir,
263
+ max_resolution,
264
+ min_bucket_reso,
265
+ max_bucket_reso,
266
+ batch_size,
267
+ flip_aug,
268
+ caption_metadata_filename,
269
+ latent_metadata_filename,
270
+ full_path,
271
+ learning_rate,
272
+ lr_scheduler,
273
+ lr_warmup,
274
+ dataset_repeats,
275
+ train_batch_size,
276
+ epoch,
277
+ save_every_n_epochs,
278
+ mixed_precision,
279
+ save_precision,
280
+ seed,
281
+ num_cpu_threads_per_process,
282
+ train_text_encoder,
283
+ generate_caption_database,
284
+ generate_image_buckets,
285
+ save_model_as,
286
+ caption_extension,
287
+ # use_8bit_adam,
288
+ xformers,
289
+ clip_skip,
290
+ save_state,
291
+ resume,
292
+ gradient_checkpointing,
293
+ gradient_accumulation_steps,
294
+ mem_eff_attn,
295
+ shuffle_caption,
296
+ output_name,
297
+ max_token_length,
298
+ max_train_epochs,
299
+ max_data_loader_n_workers,
300
+ full_fp16,
301
+ color_aug,
302
+ model_list, # Keep this. Yes, it is unused here but required given the common list used
303
+ cache_latents,
304
+ use_latent_files,
305
+ keep_tokens,
306
+ persistent_data_loader_workers,
307
+ bucket_no_upscale,
308
+ random_crop,
309
+ bucket_reso_steps,
310
+ caption_dropout_every_n_epochs,
311
+ caption_dropout_rate,
312
+ optimizer,
313
+ optimizer_args,
314
+ noise_offset,
315
+ sample_every_n_steps,
316
+ sample_every_n_epochs,
317
+ sample_sampler,
318
+ sample_prompts,
319
+ additional_parameters,
320
+ vae_batch_size,
321
+ min_snr_gamma,
322
+ ):
323
+ if check_if_model_exist(output_name, output_dir, save_model_as):
324
+ return
325
+
326
+ # create caption json file
327
+ if generate_caption_database:
328
+ if not os.path.exists(train_dir):
329
+ os.mkdir(train_dir)
330
+
331
+ run_cmd = f'{PYTHON} finetune/merge_captions_to_metadata.py'
332
+ if caption_extension == '':
333
+ run_cmd += f' --caption_extension=".caption"'
334
+ else:
335
+ run_cmd += f' --caption_extension={caption_extension}'
336
+ run_cmd += f' "{image_folder}"'
337
+ run_cmd += f' "{train_dir}/{caption_metadata_filename}"'
338
+ if full_path:
339
+ run_cmd += f' --full_path'
340
+
341
+ print(run_cmd)
342
+
343
+ # Run the command
344
+ if os.name == 'posix':
345
+ os.system(run_cmd)
346
+ else:
347
+ subprocess.run(run_cmd)
348
+
349
+ # create images buckets
350
+ if generate_image_buckets:
351
+ run_cmd = f'{PYTHON} finetune/prepare_buckets_latents.py'
352
+ run_cmd += f' "{image_folder}"'
353
+ run_cmd += f' "{train_dir}/{caption_metadata_filename}"'
354
+ run_cmd += f' "{train_dir}/{latent_metadata_filename}"'
355
+ run_cmd += f' "{pretrained_model_name_or_path}"'
356
+ run_cmd += f' --batch_size={batch_size}'
357
+ run_cmd += f' --max_resolution={max_resolution}'
358
+ run_cmd += f' --min_bucket_reso={min_bucket_reso}'
359
+ run_cmd += f' --max_bucket_reso={max_bucket_reso}'
360
+ run_cmd += f' --mixed_precision={mixed_precision}'
361
+ # if flip_aug:
362
+ # run_cmd += f' --flip_aug'
363
+ if full_path:
364
+ run_cmd += f' --full_path'
365
+
366
+ print(run_cmd)
367
+
368
+ # Run the command
369
+ if os.name == 'posix':
370
+ os.system(run_cmd)
371
+ else:
372
+ subprocess.run(run_cmd)
373
+
374
+ image_num = len(
375
+ [
376
+ f
377
+ for f, lower_f in (
378
+ (file, file.lower()) for file in os.listdir(image_folder)
379
+ )
380
+ if lower_f.endswith(('.jpg', '.jpeg', '.png', '.webp'))
381
+ ]
382
+ )
383
+ print(f'image_num = {image_num}')
384
+
385
+ repeats = int(image_num) * int(dataset_repeats)
386
+ print(f'repeats = {str(repeats)}')
387
+
388
+ # calculate max_train_steps
389
+ max_train_steps = int(
390
+ math.ceil(float(repeats) / int(train_batch_size) * int(epoch))
391
+ )
392
+
393
+ # Divide by two because flip augmentation creates two copies of each source image
394
+ if flip_aug:
395
+ max_train_steps = int(math.ceil(float(max_train_steps) / 2))
396
+
397
+ print(f'max_train_steps = {max_train_steps}')
398
+
399
+ lr_warmup_steps = round(float(int(lr_warmup) * int(max_train_steps) / 100))
400
+ print(f'lr_warmup_steps = {lr_warmup_steps}')
401
+
402
+ run_cmd = f'accelerate launch --num_cpu_threads_per_process={num_cpu_threads_per_process} "./fine_tune.py"'
403
+ if v2:
404
+ run_cmd += ' --v2'
405
+ if v_parameterization:
406
+ run_cmd += ' --v_parameterization'
407
+ if train_text_encoder:
408
+ run_cmd += ' --train_text_encoder'
409
+ run_cmd += (
410
+ f' --pretrained_model_name_or_path="{pretrained_model_name_or_path}"'
411
+ )
412
+ if use_latent_files == 'Yes':
413
+ run_cmd += f' --in_json="{train_dir}/{latent_metadata_filename}"'
414
+ else:
415
+ run_cmd += f' --in_json="{train_dir}/{caption_metadata_filename}"'
416
+ run_cmd += f' --train_data_dir="{image_folder}"'
417
+ run_cmd += f' --output_dir="{output_dir}"'
418
+ if not logging_dir == '':
419
+ run_cmd += f' --logging_dir="{logging_dir}"'
420
+ run_cmd += f' --dataset_repeats={dataset_repeats}'
421
+ run_cmd += f' --learning_rate={learning_rate}'
422
+
423
+ run_cmd += ' --enable_bucket'
424
+ run_cmd += f' --resolution={max_resolution}'
425
+ run_cmd += f' --min_bucket_reso={min_bucket_reso}'
426
+ run_cmd += f' --max_bucket_reso={max_bucket_reso}'
427
+
428
+ if not save_model_as == 'same as source model':
429
+ run_cmd += f' --save_model_as={save_model_as}'
430
+ if int(gradient_accumulation_steps) > 1:
431
+ run_cmd += f' --gradient_accumulation_steps={int(gradient_accumulation_steps)}'
432
+ # if save_state:
433
+ # run_cmd += ' --save_state'
434
+ # if not resume == '':
435
+ # run_cmd += f' --resume={resume}'
436
+ if not output_name == '':
437
+ run_cmd += f' --output_name="{output_name}"'
438
+ if int(max_token_length) > 75:
439
+ run_cmd += f' --max_token_length={max_token_length}'
440
+
441
+ run_cmd += run_cmd_training(
442
+ learning_rate=learning_rate,
443
+ lr_scheduler=lr_scheduler,
444
+ lr_warmup_steps=lr_warmup_steps,
445
+ train_batch_size=train_batch_size,
446
+ max_train_steps=max_train_steps,
447
+ save_every_n_epochs=save_every_n_epochs,
448
+ mixed_precision=mixed_precision,
449
+ save_precision=save_precision,
450
+ seed=seed,
451
+ caption_extension=caption_extension,
452
+ cache_latents=cache_latents,
453
+ optimizer=optimizer,
454
+ optimizer_args=optimizer_args,
455
+ )
456
+
457
+ run_cmd += run_cmd_advanced_training(
458
+ max_train_epochs=max_train_epochs,
459
+ max_data_loader_n_workers=max_data_loader_n_workers,
460
+ max_token_length=max_token_length,
461
+ resume=resume,
462
+ save_state=save_state,
463
+ mem_eff_attn=mem_eff_attn,
464
+ clip_skip=clip_skip,
465
+ flip_aug=flip_aug,
466
+ color_aug=color_aug,
467
+ shuffle_caption=shuffle_caption,
468
+ gradient_checkpointing=gradient_checkpointing,
469
+ full_fp16=full_fp16,
470
+ xformers=xformers,
471
+ # use_8bit_adam=use_8bit_adam,
472
+ keep_tokens=keep_tokens,
473
+ persistent_data_loader_workers=persistent_data_loader_workers,
474
+ bucket_no_upscale=bucket_no_upscale,
475
+ random_crop=random_crop,
476
+ bucket_reso_steps=bucket_reso_steps,
477
+ caption_dropout_every_n_epochs=caption_dropout_every_n_epochs,
478
+ caption_dropout_rate=caption_dropout_rate,
479
+ noise_offset=noise_offset,
480
+ additional_parameters=additional_parameters,
481
+ vae_batch_size=vae_batch_size,
482
+ min_snr_gamma=min_snr_gamma,
483
+ )
484
+
485
+ run_cmd += run_cmd_sample(
486
+ sample_every_n_steps,
487
+ sample_every_n_epochs,
488
+ sample_sampler,
489
+ sample_prompts,
490
+ output_dir,
491
+ )
492
+
493
+ print(run_cmd)
494
+
495
+ # Run the command
496
+ if os.name == 'posix':
497
+ os.system(run_cmd)
498
+ else:
499
+ subprocess.run(run_cmd)
500
+
501
+ # check if output_dir/output_name is a folder (i.e. a diffusers model); if not, copy the inference config if required
502
+ last_dir = pathlib.Path(f'{output_dir}/{output_name}')
503
+
504
+ if not last_dir.is_dir():
505
+ # Copy inference model for v2 if required
506
+ save_inference_file(output_dir, v2, v_parameterization, output_name)
507
+
508
+
509
+ def remove_doublequote(file_path):
510
+ if file_path != None:
511
+ file_path = file_path.replace('"', '')
512
+
513
+ return file_path
514
+
515
+
516
+ def finetune_tab():
517
+ dummy_db_true = gr.Label(value=True, visible=False)
518
+ dummy_db_false = gr.Label(value=False, visible=False)
519
+ gr.Markdown('Train a custom model using kohya finetune python code...')
520
+
521
+ (
522
+ button_open_config,
523
+ button_save_config,
524
+ button_save_as_config,
525
+ config_file_name,
526
+ button_load_config,
527
+ ) = gradio_config()
528
+
529
+ (
530
+ pretrained_model_name_or_path,
531
+ v2,
532
+ v_parameterization,
533
+ save_model_as,
534
+ model_list,
535
+ ) = gradio_source_model()
536
+
537
+ with gr.Tab('Folders'):
538
+ with gr.Row():
539
+ train_dir = gr.Textbox(
540
+ label='Training config folder',
541
+ placeholder='folder where the training configuration files will be saved',
542
+ )
543
+ train_dir_folder = gr.Button(
544
+ folder_symbol, elem_id='open_folder_small'
545
+ )
546
+ train_dir_folder.click(
547
+ get_folder_path,
548
+ outputs=train_dir,
549
+ show_progress=False,
550
+ )
551
+
552
+ image_folder = gr.Textbox(
553
+ label='Training Image folder',
554
+ placeholder='folder where the training images are located',
555
+ )
556
+ image_folder_input_folder = gr.Button(
557
+ folder_symbol, elem_id='open_folder_small'
558
+ )
559
+ image_folder_input_folder.click(
560
+ get_folder_path,
561
+ outputs=image_folder,
562
+ show_progress=False,
563
+ )
564
+ with gr.Row():
565
+ output_dir = gr.Textbox(
566
+ label='Model output folder',
567
+ placeholder='folder where the model will be saved',
568
+ )
569
+ output_dir_input_folder = gr.Button(
570
+ folder_symbol, elem_id='open_folder_small'
571
+ )
572
+ output_dir_input_folder.click(
573
+ get_folder_path,
574
+ outputs=output_dir,
575
+ show_progress=False,
576
+ )
577
+
578
+ logging_dir = gr.Textbox(
579
+ label='Logging folder',
580
+ placeholder='Optional: enable logging and output TensorBoard log to this folder',
581
+ )
582
+ logging_dir_input_folder = gr.Button(
583
+ folder_symbol, elem_id='open_folder_small'
584
+ )
585
+ logging_dir_input_folder.click(
586
+ get_folder_path,
587
+ outputs=logging_dir,
588
+ show_progress=False,
589
+ )
590
+ with gr.Row():
591
+ output_name = gr.Textbox(
592
+ label='Model output name',
593
+ placeholder='Name of the model to output',
594
+ value='last',
595
+ interactive=True,
596
+ )
597
+ train_dir.change(
598
+ remove_doublequote,
599
+ inputs=[train_dir],
600
+ outputs=[train_dir],
601
+ )
602
+ image_folder.change(
603
+ remove_doublequote,
604
+ inputs=[image_folder],
605
+ outputs=[image_folder],
606
+ )
607
+ output_dir.change(
608
+ remove_doublequote,
609
+ inputs=[output_dir],
610
+ outputs=[output_dir],
611
+ )
612
+ with gr.Tab('Dataset preparation'):
613
+ with gr.Row():
614
+ max_resolution = gr.Textbox(
615
+ label='Resolution (width,height)', value='512,512'
616
+ )
617
+ min_bucket_reso = gr.Textbox(
618
+ label='Min bucket resolution', value='256'
619
+ )
620
+ max_bucket_reso = gr.Textbox(
621
+ label='Max bucket resolution', value='1024'
622
+ )
623
+ batch_size = gr.Textbox(label='Batch size', value='1')
624
+ with gr.Row():
625
+ create_caption = gr.Checkbox(
626
+ label='Generate caption metadata', value=True
627
+ )
628
+ create_buckets = gr.Checkbox(
629
+ label='Generate image buckets metadata', value=True
630
+ )
631
+ use_latent_files = gr.Dropdown(
632
+ label='Use latent files',
633
+ choices=[
634
+ 'No',
635
+ 'Yes',
636
+ ],
637
+ value='Yes',
638
+ )
639
+ with gr.Accordion('Advanced parameters', open=False):
640
+ with gr.Row():
641
+ caption_metadata_filename = gr.Textbox(
642
+ label='Caption metadata filename', value='meta_cap.json'
643
+ )
644
+ latent_metadata_filename = gr.Textbox(
645
+ label='Latent metadata filename', value='meta_lat.json'
646
+ )
647
+ full_path = gr.Checkbox(label='Use full path', value=True)
648
+ with gr.Tab('Training parameters'):
649
+ (
650
+ learning_rate,
651
+ lr_scheduler,
652
+ lr_warmup,
653
+ train_batch_size,
654
+ epoch,
655
+ save_every_n_epochs,
656
+ mixed_precision,
657
+ save_precision,
658
+ num_cpu_threads_per_process,
659
+ seed,
660
+ caption_extension,
661
+ cache_latents,
662
+ optimizer,
663
+ optimizer_args,
664
+ ) = gradio_training(learning_rate_value='1e-5')
665
+ with gr.Row():
666
+ dataset_repeats = gr.Textbox(label='Dataset repeats', value=40)
667
+ train_text_encoder = gr.Checkbox(
668
+ label='Train text encoder', value=True
669
+ )
670
+ with gr.Accordion('Advanced parameters', open=False):
671
+ with gr.Row():
672
+ gradient_accumulation_steps = gr.Number(
673
+ label='Gradient accumulate steps', value='1'
674
+ )
675
+ (
676
+ # use_8bit_adam,
677
+ xformers,
678
+ full_fp16,
679
+ gradient_checkpointing,
680
+ shuffle_caption,
681
+ color_aug,
682
+ flip_aug,
683
+ clip_skip,
684
+ mem_eff_attn,
685
+ save_state,
686
+ resume,
687
+ max_token_length,
688
+ max_train_epochs,
689
+ max_data_loader_n_workers,
690
+ keep_tokens,
691
+ persistent_data_loader_workers,
692
+ bucket_no_upscale,
693
+ random_crop,
694
+ bucket_reso_steps,
695
+ caption_dropout_every_n_epochs,
696
+ caption_dropout_rate,
697
+ noise_offset,
698
+ additional_parameters,
699
+ vae_batch_size,
700
+ min_snr_gamma,
701
+ ) = gradio_advanced_training()
702
+ color_aug.change(
703
+ color_aug_changed,
704
+ inputs=[color_aug],
705
+ outputs=[cache_latents], # Not applicable to fine_tune.py
706
+ )
707
+
708
+ (
709
+ sample_every_n_steps,
710
+ sample_every_n_epochs,
711
+ sample_sampler,
712
+ sample_prompts,
713
+ ) = sample_gradio_config()
714
+
715
+ button_run = gr.Button('Train model', variant='primary')
716
+
717
+ # Setup gradio tensorboard buttons
718
+ button_start_tensorboard, button_stop_tensorboard = gradio_tensorboard()
719
+
720
+ button_start_tensorboard.click(
721
+ start_tensorboard,
722
+ inputs=logging_dir,
723
+ )
724
+
725
+ button_stop_tensorboard.click(
726
+ stop_tensorboard,
727
+ show_progress=False,
728
+ )
729
+
730
+ settings_list = [
731
+ pretrained_model_name_or_path,
732
+ v2,
733
+ v_parameterization,
734
+ train_dir,
735
+ image_folder,
736
+ output_dir,
737
+ logging_dir,
738
+ max_resolution,
739
+ min_bucket_reso,
740
+ max_bucket_reso,
741
+ batch_size,
742
+ flip_aug,
743
+ caption_metadata_filename,
744
+ latent_metadata_filename,
745
+ full_path,
746
+ learning_rate,
747
+ lr_scheduler,
748
+ lr_warmup,
749
+ dataset_repeats,
750
+ train_batch_size,
751
+ epoch,
752
+ save_every_n_epochs,
753
+ mixed_precision,
754
+ save_precision,
755
+ seed,
756
+ num_cpu_threads_per_process,
757
+ train_text_encoder,
758
+ create_caption,
759
+ create_buckets,
760
+ save_model_as,
761
+ caption_extension,
762
+ # use_8bit_adam,
763
+ xformers,
764
+ clip_skip,
765
+ save_state,
766
+ resume,
767
+ gradient_checkpointing,
768
+ gradient_accumulation_steps,
769
+ mem_eff_attn,
770
+ shuffle_caption,
771
+ output_name,
772
+ max_token_length,
773
+ max_train_epochs,
774
+ max_data_loader_n_workers,
775
+ full_fp16,
776
+ color_aug,
777
+ model_list,
778
+ cache_latents,
779
+ use_latent_files,
780
+ keep_tokens,
781
+ persistent_data_loader_workers,
782
+ bucket_no_upscale,
783
+ random_crop,
784
+ bucket_reso_steps,
785
+ caption_dropout_every_n_epochs,
786
+ caption_dropout_rate,
787
+ optimizer,
788
+ optimizer_args,
789
+ noise_offset,
790
+ sample_every_n_steps,
791
+ sample_every_n_epochs,
792
+ sample_sampler,
793
+ sample_prompts,
794
+ additional_parameters,
795
+ vae_batch_size,
796
+ min_snr_gamma,
797
+ ]
798
+
799
+ button_run.click(train_model, inputs=settings_list)
800
+
801
+ button_open_config.click(
802
+ open_configuration,
803
+ inputs=[dummy_db_true, config_file_name] + settings_list,
804
+ outputs=[config_file_name] + settings_list,
805
+ show_progress=False,
806
+ )
807
+
808
+ button_load_config.click(
809
+ open_configuration,
810
+ inputs=[dummy_db_false, config_file_name] + settings_list,
811
+ outputs=[config_file_name] + settings_list,
812
+ show_progress=False,
813
+ )
814
+
815
+ button_save_config.click(
816
+ save_configuration,
817
+ inputs=[dummy_db_false, config_file_name] + settings_list,
818
+ outputs=[config_file_name],
819
+ show_progress=False,
820
+ )
821
+
822
+ button_save_as_config.click(
823
+ save_configuration,
824
+ inputs=[dummy_db_true, config_file_name] + settings_list,
825
+ outputs=[config_file_name],
826
+ show_progress=False,
827
+ )
828
+
829
+
830
+ def UI(**kwargs):
831
+
832
+ css = ''
833
+
834
+ if os.path.exists('./style.css'):
835
+ with open(os.path.join('./style.css'), 'r', encoding='utf8') as file:
836
+ print('Load CSS...')
837
+ css += file.read() + '\n'
838
+
839
+ interface = gr.Blocks(css=css)
840
+
841
+ with interface:
842
+ with gr.Tab('Finetune'):
843
+ finetune_tab()
844
+ with gr.Tab('Utilities'):
845
+ utilities_tab(enable_dreambooth_tab=False)
846
+
847
+ # Show the interface
848
+ launch_kwargs = {}
849
+ if not kwargs.get('username', None) == '':
850
+ launch_kwargs['auth'] = (
851
+ kwargs.get('username', None),
852
+ kwargs.get('password', None),
853
+ )
854
+ if kwargs.get('server_port', 0) > 0:
855
+ launch_kwargs['server_port'] = kwargs.get('server_port', 0)
856
+ if kwargs.get('inbrowser', False):
857
+ launch_kwargs['inbrowser'] = kwargs.get('inbrowser', False)
858
+ print(launch_kwargs)
859
+ interface.launch(**launch_kwargs)
860
+
861
+
862
+ if __name__ == '__main__':
863
+ # torch.cuda.set_per_process_memory_fraction(0.48)
864
+ parser = argparse.ArgumentParser()
865
+ parser.add_argument(
866
+ '--username', type=str, default='', help='Username for authentication'
867
+ )
868
+ parser.add_argument(
869
+ '--password', type=str, default='', help='Password for authentication'
870
+ )
871
+ parser.add_argument(
872
+ '--server_port',
873
+ type=int,
874
+ default=0,
875
+ help='Port to run the server listener on',
876
+ )
877
+ parser.add_argument(
878
+ '--inbrowser', action='store_true', help='Open in browser'
879
+ )
880
+
881
+ args = parser.parse_args()
882
+
883
+ UI(
884
+ username=args.username,
885
+ password=args.password,
886
+ inbrowser=args.inbrowser,
887
+ server_port=args.server_port,
888
+ )
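
To make the schedule arithmetic in train_model() concrete, here is a small worked example (not part of this commit) using made-up numbers for the image count and training settings.

import math

image_num = 100          # images found in the training folder
dataset_repeats = 40
train_batch_size = 4
epoch = 2
lr_warmup = 10           # warmup as a percentage of total steps
flip_aug = False

repeats = image_num * dataset_repeats                                   # 4000
max_train_steps = int(math.ceil(repeats / train_batch_size * epoch))    # 2000
if flip_aug:
    # flip augmentation doubles the effective dataset, so the GUI halves the step count
    max_train_steps = int(math.ceil(max_train_steps / 2))
lr_warmup_steps = round(lr_warmup * max_train_steps / 100)              # 200

print(max_train_steps, lr_warmup_steps)
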
gen_img_diffusers.py ADDED
The diff for this file is too large to render. See raw diff
 
gui.sh ADDED
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # Activate the virtual environment
4
+ source ./venv/bin/activate
5
+
6
+ # If the requirements are validated, run the kohya_gui.py script with the command-line arguments
7
+ if python tools/validate_requirements.py; then
8
+ python kohya_gui.py "$@"
9
+ fi
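
tools/validate_requirements.py itself is not shown in this commit; the sketch below is only a hypothetical stand-in for the kind of gate gui.sh relies on, namely a script that exits with a non-zero status when a required package is missing so the "if python tools/validate_requirements.py; then" guard skips launching the GUI. The package names are an illustrative subset, not the real requirements list.

import importlib.util
import sys

REQUIRED = ["gradio", "torch", "transformers"]   # illustrative subset only

missing = [name for name in REQUIRED if importlib.util.find_spec(name) is None]
if missing:
    print("missing packages: " + ", ".join(missing) + "; run pip install -r requirements.txt")
    sys.exit(1)
sys.exit(0)
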
kohya_gui.py ADDED
@@ -0,0 +1,110 @@
1
+ import gradio as gr
2
+ import os
3
+ import argparse
4
+ from dreambooth_gui import dreambooth_tab
5
+ from finetune_gui import finetune_tab
6
+ from textual_inversion_gui import ti_tab
7
+ from library.utilities import utilities_tab
8
+ from library.extract_lora_gui import gradio_extract_lora_tab
9
+ from library.extract_lycoris_locon_gui import gradio_extract_lycoris_locon_tab
10
+ from library.merge_lora_gui import gradio_merge_lora_tab
11
+ from library.resize_lora_gui import gradio_resize_lora_tab
12
+ from lora_gui import lora_tab
13
+
14
+
15
+ def UI(**kwargs):
16
+ css = ''
17
+
18
+ if os.path.exists('./style.css'):
19
+ with open(os.path.join('./style.css'), 'r', encoding='utf8') as file:
20
+ print('Load CSS...')
21
+ css += file.read() + '\n'
22
+
23
+ interface = gr.Blocks(css=css, title='Kohya_ss GUI')
24
+
25
+ with interface:
26
+ with gr.Tab('Dreambooth'):
27
+ (
28
+ train_data_dir_input,
29
+ reg_data_dir_input,
30
+ output_dir_input,
31
+ logging_dir_input,
32
+ ) = dreambooth_tab()
33
+ with gr.Tab('Dreambooth LoRA'):
34
+ lora_tab()
35
+ with gr.Tab('Dreambooth TI'):
36
+ ti_tab()
37
+ with gr.Tab('Finetune'):
38
+ finetune_tab()
39
+ with gr.Tab('Utilities'):
40
+ utilities_tab(
41
+ train_data_dir_input=train_data_dir_input,
42
+ reg_data_dir_input=reg_data_dir_input,
43
+ output_dir_input=output_dir_input,
44
+ logging_dir_input=logging_dir_input,
45
+ enable_copy_info_button=True,
46
+ )
47
+ gradio_extract_lora_tab()
48
+ gradio_extract_lycoris_locon_tab()
49
+ gradio_merge_lora_tab()
50
+ gradio_resize_lora_tab()
51
+
52
+ # Show the interface
53
+ launch_kwargs = {}
54
+ username = kwargs.get('username')
55
+ password = kwargs.get('password')
56
+ server_port = kwargs.get('server_port', 0)
57
+ inbrowser = kwargs.get('inbrowser', False)
58
+ share = kwargs.get('share', False)
59
+ server_name = kwargs.get('listen')
60
+
61
+ launch_kwargs['server_name'] = server_name
62
+ if username and password:
63
+ launch_kwargs['auth'] = (username, password)
64
+ if server_port > 0:
65
+ launch_kwargs['server_port'] = server_port
66
+ if inbrowser:
67
+ launch_kwargs['inbrowser'] = inbrowser
68
+ if share:
69
+ launch_kwargs['share'] = share
70
+ interface.launch(**launch_kwargs)
71
+
72
+
73
+ if __name__ == '__main__':
74
+ # torch.cuda.set_per_process_memory_fraction(0.48)
75
+ parser = argparse.ArgumentParser()
76
+ parser.add_argument(
77
+ '--listen',
78
+ type=str,
79
+ default='127.0.0.1',
80
+ help='IP to listen on for connections to Gradio',
81
+ )
82
+ parser.add_argument(
83
+ '--username', type=str, default='', help='Username for authentication'
84
+ )
85
+ parser.add_argument(
86
+ '--password', type=str, default='', help='Password for authentication'
87
+ )
88
+ parser.add_argument(
89
+ '--server_port',
90
+ type=int,
91
+ default=0,
92
+ help='Port to run the server listener on',
93
+ )
94
+ parser.add_argument(
95
+ '--inbrowser', action='store_true', help='Open in browser'
96
+ )
97
+ parser.add_argument(
98
+ '--share', action='store_true', help='Share the gradio UI'
99
+ )
100
+
101
+ args = parser.parse_args()
102
+
103
+ UI(
104
+ username=args.username,
105
+ password=args.password,
106
+ inbrowser=args.inbrowser,
107
+ server_port=args.server_port,
108
+ share=args.share,
109
+ listen=args.listen,
110
+ )
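
For reference, the same options exposed by the command-line flags above can be passed to UI() directly; a minimal sketch (not part of this commit) with placeholder values:

from kohya_gui import UI

UI(
    username="admin",      # together with password, enables Gradio auth
    password="change-me",
    server_port=7860,      # 0 lets Gradio choose a port
    inbrowser=True,
    share=False,           # True requests a public gradio.live link
    listen="127.0.0.1",    # passed to launch() as server_name
)
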
kohya_ss_colab.ipynb ADDED
@@ -0,0 +1,448 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "colab_type": "text",
7
+ "id": "view-in-github"
8
+ },
9
+ "source": [
10
+ "<a href=\"https://colab.research.google.com/github/panguin6010/kohya_ss_google_colab/blob/master/kohya_ss_colab.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "markdown",
15
+ "metadata": {
16
+ "id": "MvroZ9rJ1iqN"
17
+ },
18
+ "source": [
19
+ "# Kohya SS WebUI Colab Setup\n",
20
+ "\n",
21
+ "This Colab workbook sets up a Kohya SS instance on Colab and provides a link to access the Kohya WebUI on Gradio Live. Kohya SS is a Python library that provides Stable Diffusion-based models for image, text, and audio generation tasks. This Colab workbook provides a convenient way for users to run Kohya SS without needing to install anything on their local machine.\n",
22
+ "\n",
23
+ "This workbook was inspired by the work of [Spaceginner](https://github.com/Spaceginner)'s original Colab workbook and the [Kohya SS project](https://github.com/bmaltais/kohya_ss) by [bmaltais](https://github.com/bmaltais). The Colab workbook was coded by [panguin6010](https://github.com/panguin6010) \n",
24
+ "\n",
25
+ "\n",
26
+ "## Tutorials\n",
27
+ "\n",
28
+ "Before running this code, make sure you are familiar with using Colab workbooks and have a basic understanding of Kohya SS and its usage. You can find tutorials for these online. If you encounter any issues or have suggestions for improvement, feel free to contribute to the project.\n"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "markdown",
33
+ "metadata": {
34
+ "id": "DrAnm1um5vjh"
35
+ },
36
+ "source": [
37
+ "\n",
38
+ "\n",
39
+ "\n",
40
+ "---\n",
41
+ "\n"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": null,
47
+ "metadata": {
48
+ "colab": {
49
+ "base_uri": "https://localhost:8080/"
50
+ },
51
+ "id": "vmoRnFQEqOeY",
52
+ "outputId": "09876c9a-d043-4881-d92f-6ed54313c390"
53
+ },
54
+ "outputs": [],
55
+ "source": [
56
+ "#@markdown #Step 1: Mounting Google Drive\n",
57
+ "\n",
58
+ "#@markdown The first step in setting up Kohya SS on Colab is to mount your Google Drive to the Colab notebook. This allows you to save and access files from your Google Drive in the Colab notebook.\n",
59
+ "\n",
60
+ "#@markdown To mount your Google Drive, run the following code block, which mounts your Google Drive to the /content/gdrive directory in the Colab notebook.\n",
61
+ "\n",
62
+ "\n",
63
+ "\n",
64
+ "from google.colab import drive\n",
65
+ "drive.mount('/content/gdrive')"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "markdown",
70
+ "metadata": {
71
+ "id": "mvQwnr4354BM"
72
+ },
73
+ "source": [
74
+ "\n",
75
+ "\n",
76
+ "---\n",
77
+ "\n"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": null,
83
+ "metadata": {
84
+ "cellView": "form",
85
+ "colab": {
86
+ "base_uri": "https://localhost:8080/",
87
+ "height": 49,
88
+ "referenced_widgets": [
89
+ "7ca7f6f727da46ac9a1149e69c16c81f",
90
+ "77e5e07552b641cf9c368fb3939cb1d1",
91
+ "235e01b92646444387ebd31ab945358e"
92
+ ]
93
+ },
94
+ "id": "jnhm7ycMrLWb",
95
+ "outputId": "63ba39ed-90c6-4b2d-f03e-61775587b083"
96
+ },
97
+ "outputs": [],
98
+ "source": [
99
+ "#@markdown #Kohya SS WebUI Installation\n",
100
+ "\n",
101
+ "#@markdown Now that your Google Drive is linked, we need to install the Kohya SS WebUI.\n",
102
+ "\n",
103
+ "#@markdown The code clones the [Kohya SS Google Colab](\"https://github.com/panguin6010/kohya_ss_google_colab\") repository and creates the necessary directories for Kohya SS to run. It then resets the git repository and pulls the latest changes. Finally, it displays a success message.\n",
104
+ "\n",
105
+ "#@markdown Note: If Google Drive is not connected, the code will use Colab storage instead.\n",
106
+ "\n",
107
+ "#@title\n",
108
+ "# Import necessary libraries\n",
109
+ "from IPython.display import clear_output\n",
110
+ "from IPython.utils import capture\n",
111
+ "from subprocess import getoutput\n",
112
+ "import ipywidgets as widgets\n",
113
+ "import sys\n",
114
+ "import fileinput\n",
115
+ "import os\n",
116
+ "import time\n",
117
+ "\n",
118
+ "# WebUI Installation\n",
119
+ "\n",
120
+ "# Check if Google Drive is connected\n",
121
+ "if not os.path.exists(\"/content/gdrive/MyDrive/\"):\n",
122
+ " print(\"Gdrive not connected, using colab storage ...\")\n",
123
+ " time.sleep(4)\n",
124
+ " !mkdir -p /content/gdrive/MyDrive/\n",
125
+ "\n",
126
+ "# Clone the repository and create necessary directories\n",
127
+ "with capture.capture_output() as cap:\n",
128
+ " def inf(msg, style, wdth):\n",
129
+ " inf = widgets.Button(description=msg, disabled=True, button_style=style, layout=widgets.Layout(min_width=wdth))\n",
130
+ " display(inf)\n",
131
+ "\n",
132
+ " %mkdir -p /content/gdrive/MyDrive/sd\n",
133
+ " %cd /content/gdrive/MyDrive/sd\n",
134
+ " !git clone https://github.com/panguin6010/kohya_ss_google_colab kohya_ss_colab\n",
135
+ " !mkdir -p /content/gdrive/MyDrive/sd/kohya_ss_colab/cache/huggingface\n",
136
+ " !ln -s /content/gdrive/MyDrive/sd/kohya_ss_colab/cache/huggingface /root/.cache/\n",
137
+ "\n",
138
+ "# Reset the git repository and pull the latest changes\n",
139
+ "with capture.capture_output() as cap:\n",
140
+ " %cd /content/gdrive/MyDrive/sd/kohya_ss_colab/\n",
141
+ " !git reset --hard\n",
142
+ " time.sleep(1)\n",
143
+ "\n",
144
+ "print(\"Updating the repository...\")\n",
145
+ "!git pull\n",
146
+ "\n",
147
+ "# Clear the output and display the success message\n",
148
+ "clear_output()\n",
149
+ "inf(\"✓ Done\", \"success\", \"50px\")"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "markdown",
154
+ "metadata": {
155
+ "id": "8SrMhmFz7Lt4"
156
+ },
157
+ "source": [
158
+ "---"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": null,
164
+ "metadata": {
165
+ "cellView": "form",
166
+ "colab": {
167
+ "base_uri": "https://localhost:8080/",
168
+ "height": 49,
169
+ "referenced_widgets": [
170
+ "54e929bcb37e4997a696d0becdecfd84",
171
+ "43fbca3abb04401296967f819680f94f",
172
+ "6d87b2c916394932b1a53382fe3cdb4e"
173
+ ]
174
+ },
175
+ "id": "yjvkHRlDtDmV",
176
+ "outputId": "06e1e873-b1ed-4403-c9a4-19ac1caa961b"
177
+ },
178
+ "outputs": [],
179
+ "source": [
180
+ "#@markdown #Requirements Installation\n",
181
+ "\n",
182
+ "#@markdown Now that we have downloaded the Kohya SS WebUI, we need to install the necessary requirements.\n",
183
+ "\n",
184
+ "# Print the status message\n",
185
+ "print(\"Installing requirements...\")\n",
186
+ "\n",
187
+ "# Change the working directory to the project folder\n",
188
+ "%cd /content/gdrive/MyDrive/sd/kohya_ss_colab/\n",
189
+ "\n",
190
+ "# Install the requirements\n",
191
+ "with capture.capture_output() as cap:\n",
192
+ " # Uncomment the following line if you need to install specific versions of torch and torchvision\n",
193
+ " # !pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116\n",
194
+ " \n",
195
+ " # Install the requirements from the requirements.txt file\n",
196
+ " !pip install -r requirements.txt\n",
197
+ "\n",
198
+ "# Clear the output to keep the notebook clean\n",
199
+ "clear_output()\n",
200
+ "\n",
201
+ "# Print the success message\n",
202
+ "inf(\"✓ Done\", \"success\", \"50px\")"
203
+ ]
204
+ },
205
+ {
206
+ "cell_type": "markdown",
207
+ "metadata": {
208
+ "id": "FLDvlHm1tYud"
209
+ },
210
+ "source": [
211
+ "\n",
212
+ "---\n",
213
+ "\n"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": null,
219
+ "metadata": {
220
+ "colab": {
221
+ "base_uri": "https://localhost:8080/"
222
+ },
223
+ "id": "IzS3hvuTtTqW",
224
+ "outputId": "9e629e1f-c8eb-43a2-9639-2583937ba81a"
225
+ },
226
+ "outputs": [],
227
+ "source": [
228
+ "#@markdown # Start Kohya ss WebUI\n",
229
+ "\n",
230
+ "User = \"\" #@param {type:\"string\"}\n",
231
+ "Password = \"\" #@param {type:\"string\"}\n",
232
+ "\n",
233
+ "#@markdown - Adding a username and password is not necessary but it will improve the security of your Kohya instance.\n",
234
+ "#@markdown ______\n",
235
+ "#@markdown # Please click the link that concludes with ```gradio.live``` to access your instance\n",
236
+ "# Encourage users to contribute improvements\n",
237
+ "print(\"Please feel free to make any changes or improvements you think would enhance this setup. Your input and contributions are greatly appreciated!\")\n",
238
+ "# Check if the user has provided a username and password\n",
239
+ "if User and Password:\n",
240
+ " # Run the Kohya GUI with the provided credentials\n",
241
+ " !python /content/gdrive/MyDrive/sd/kohya_ss_colab/kohya_gui.py --username $User --password $Password --share \n",
242
+ "else:\n",
243
+ " # Run the Kohya GUI without credentials\n",
244
+ " !python /content/gdrive/MyDrive/sd/kohya_ss_colab/kohya_gui.py --share \n"
245
+ ]
246
+ }
247
+ ],
248
+ "metadata": {
249
+ "colab": {
250
+ "authorship_tag": "ABX9TyOZmOjfS55zOBmbTmRNOf3b",
251
+ "include_colab_link": true,
252
+ "provenance": []
253
+ },
254
+ "kernelspec": {
255
+ "display_name": "Python 3",
256
+ "name": "python3"
257
+ },
258
+ "language_info": {
259
+ "name": "python"
260
+ },
261
+ "widgets": {
262
+ "application/vnd.jupyter.widget-state+json": {
263
+ "235e01b92646444387ebd31ab945358e": {
264
+ "model_module": "@jupyter-widgets/controls",
265
+ "model_module_version": "1.5.0",
266
+ "model_name": "ButtonStyleModel",
267
+ "state": {
268
+ "_model_module": "@jupyter-widgets/controls",
269
+ "_model_module_version": "1.5.0",
270
+ "_model_name": "ButtonStyleModel",
271
+ "_view_count": null,
272
+ "_view_module": "@jupyter-widgets/base",
273
+ "_view_module_version": "1.2.0",
274
+ "_view_name": "StyleView",
275
+ "button_color": null,
276
+ "font_weight": ""
277
+ }
278
+ },
279
+ "43fbca3abb04401296967f819680f94f": {
280
+ "model_module": "@jupyter-widgets/base",
281
+ "model_module_version": "1.2.0",
282
+ "model_name": "LayoutModel",
283
+ "state": {
284
+ "_model_module": "@jupyter-widgets/base",
285
+ "_model_module_version": "1.2.0",
286
+ "_model_name": "LayoutModel",
287
+ "_view_count": null,
288
+ "_view_module": "@jupyter-widgets/base",
289
+ "_view_module_version": "1.2.0",
290
+ "_view_name": "LayoutView",
291
+ "align_content": null,
292
+ "align_items": null,
293
+ "align_self": null,
294
+ "border": null,
295
+ "bottom": null,
296
+ "display": null,
297
+ "flex": null,
298
+ "flex_flow": null,
299
+ "grid_area": null,
300
+ "grid_auto_columns": null,
301
+ "grid_auto_flow": null,
302
+ "grid_auto_rows": null,
303
+ "grid_column": null,
304
+ "grid_gap": null,
305
+ "grid_row": null,
306
+ "grid_template_areas": null,
307
+ "grid_template_columns": null,
308
+ "grid_template_rows": null,
309
+ "height": null,
310
+ "justify_content": null,
311
+ "justify_items": null,
312
+ "left": null,
313
+ "margin": null,
314
+ "max_height": null,
315
+ "max_width": null,
316
+ "min_height": null,
317
+ "min_width": "50px",
318
+ "object_fit": null,
319
+ "object_position": null,
320
+ "order": null,
321
+ "overflow": null,
322
+ "overflow_x": null,
323
+ "overflow_y": null,
324
+ "padding": null,
325
+ "right": null,
326
+ "top": null,
327
+ "visibility": null,
328
+ "width": null
329
+ }
330
+ },
331
+ "54e929bcb37e4997a696d0becdecfd84": {
332
+ "model_module": "@jupyter-widgets/controls",
333
+ "model_module_version": "1.5.0",
334
+ "model_name": "ButtonModel",
335
+ "state": {
336
+ "_dom_classes": [],
337
+ "_model_module": "@jupyter-widgets/controls",
338
+ "_model_module_version": "1.5.0",
339
+ "_model_name": "ButtonModel",
340
+ "_view_count": null,
341
+ "_view_module": "@jupyter-widgets/controls",
342
+ "_view_module_version": "1.5.0",
343
+ "_view_name": "ButtonView",
344
+ "button_style": "success",
345
+ "description": "✓ Done",
346
+ "disabled": true,
347
+ "icon": "",
348
+ "layout": "IPY_MODEL_43fbca3abb04401296967f819680f94f",
349
+ "style": "IPY_MODEL_6d87b2c916394932b1a53382fe3cdb4e",
350
+ "tooltip": ""
351
+ }
352
+ },
353
+ "6d87b2c916394932b1a53382fe3cdb4e": {
354
+ "model_module": "@jupyter-widgets/controls",
355
+ "model_module_version": "1.5.0",
356
+ "model_name": "ButtonStyleModel",
357
+ "state": {
358
+ "_model_module": "@jupyter-widgets/controls",
359
+ "_model_module_version": "1.5.0",
360
+ "_model_name": "ButtonStyleModel",
361
+ "_view_count": null,
362
+ "_view_module": "@jupyter-widgets/base",
363
+ "_view_module_version": "1.2.0",
364
+ "_view_name": "StyleView",
365
+ "button_color": null,
366
+ "font_weight": ""
367
+ }
368
+ },
369
+ "77e5e07552b641cf9c368fb3939cb1d1": {
370
+ "model_module": "@jupyter-widgets/base",
371
+ "model_module_version": "1.2.0",
372
+ "model_name": "LayoutModel",
373
+ "state": {
374
+ "_model_module": "@jupyter-widgets/base",
375
+ "_model_module_version": "1.2.0",
376
+ "_model_name": "LayoutModel",
377
+ "_view_count": null,
378
+ "_view_module": "@jupyter-widgets/base",
379
+ "_view_module_version": "1.2.0",
380
+ "_view_name": "LayoutView",
381
+ "align_content": null,
382
+ "align_items": null,
383
+ "align_self": null,
384
+ "border": null,
385
+ "bottom": null,
386
+ "display": null,
387
+ "flex": null,
388
+ "flex_flow": null,
389
+ "grid_area": null,
390
+ "grid_auto_columns": null,
391
+ "grid_auto_flow": null,
392
+ "grid_auto_rows": null,
393
+ "grid_column": null,
394
+ "grid_gap": null,
395
+ "grid_row": null,
396
+ "grid_template_areas": null,
397
+ "grid_template_columns": null,
398
+ "grid_template_rows": null,
399
+ "height": null,
400
+ "justify_content": null,
401
+ "justify_items": null,
402
+ "left": null,
403
+ "margin": null,
404
+ "max_height": null,
405
+ "max_width": null,
406
+ "min_height": null,
407
+ "min_width": "50px",
408
+ "object_fit": null,
409
+ "object_position": null,
410
+ "order": null,
411
+ "overflow": null,
412
+ "overflow_x": null,
413
+ "overflow_y": null,
414
+ "padding": null,
415
+ "right": null,
416
+ "top": null,
417
+ "visibility": null,
418
+ "width": null
419
+ }
420
+ },
421
+ "7ca7f6f727da46ac9a1149e69c16c81f": {
422
+ "model_module": "@jupyter-widgets/controls",
423
+ "model_module_version": "1.5.0",
424
+ "model_name": "ButtonModel",
425
+ "state": {
426
+ "_dom_classes": [],
427
+ "_model_module": "@jupyter-widgets/controls",
428
+ "_model_module_version": "1.5.0",
429
+ "_model_name": "ButtonModel",
430
+ "_view_count": null,
431
+ "_view_module": "@jupyter-widgets/controls",
432
+ "_view_module_version": "1.5.0",
433
+ "_view_name": "ButtonView",
434
+ "button_style": "success",
435
+ "description": "✓ Done",
436
+ "disabled": true,
437
+ "icon": "",
438
+ "layout": "IPY_MODEL_77e5e07552b641cf9c368fb3939cb1d1",
439
+ "style": "IPY_MODEL_235e01b92646444387ebd31ab945358e",
440
+ "tooltip": ""
441
+ }
442
+ }
443
+ }
444
+ }
445
+ },
446
+ "nbformat": 4,
447
+ "nbformat_minor": 0
448
+ }
library/__init__.py ADDED
File without changes
library/basic_caption_gui.py ADDED
@@ -0,0 +1,140 @@
+ import gradio as gr
+ from easygui import msgbox
+ import subprocess
+ from .common_gui import get_folder_path, add_pre_postfix, find_replace
+ import os
+
+
+ def caption_images(
+     caption_text,
+     images_dir,
+     overwrite,
+     caption_ext,
+     prefix,
+     postfix,
+     find_text,
+     replace_text,
+ ):
+     # Check for images_dir
+     if not images_dir:
+         msgbox('Image folder is missing...')
+         return
+
+     if not caption_ext:
+         msgbox('Please provide an extension for the caption files.')
+         return
+
+     if caption_text:
+         print(f'Captioning files in {images_dir} with {caption_text}...')
+         run_cmd = f'python "tools/caption.py"'
+         run_cmd += f' --caption_text="{caption_text}"'
+         if overwrite:
+             run_cmd += f' --overwrite'
+         if caption_ext:
+             run_cmd += f' --caption_file_ext="{caption_ext}"'
+         run_cmd += f' "{images_dir}"'
+
+         print(run_cmd)
+
+         # Run the command
+         if os.name == 'posix':
+             os.system(run_cmd)
+         else:
+             subprocess.run(run_cmd)
+
+     if overwrite:
+         if prefix or postfix:
+             # Add prefix and postfix
+             add_pre_postfix(
+                 folder=images_dir,
+                 caption_file_ext=caption_ext,
+                 prefix=prefix,
+                 postfix=postfix,
+             )
+         if find_text:
+             find_replace(
+                 folder_path=images_dir,
+                 caption_file_ext=caption_ext,
+                 search_text=find_text,
+                 replace_text=replace_text,
+             )
+     else:
+         if prefix or postfix:
+             msgbox(
+                 'Could not modify caption files with requested change because the "Overwrite existing captions in folder" option is not selected...'
+             )
+
+     print('...captioning done')
+
+
+ # Gradio UI
+ def gradio_basic_caption_gui_tab():
+     with gr.Tab('Basic Captioning'):
+         gr.Markdown(
+             'This utility will allow the creation of simple caption files for each image in a folder.'
+         )
+         with gr.Row():
+             images_dir = gr.Textbox(
+                 label='Image folder to caption',
+                 placeholder='Directory containing the images to caption',
+                 interactive=True,
+             )
+             folder_button = gr.Button('📂', elem_id='open_folder_small')
+             folder_button.click(
+                 get_folder_path,
+                 outputs=images_dir,
+                 show_progress=False,
+             )
+             caption_ext = gr.Textbox(
+                 label='Caption file extension',
+                 placeholder='Extension for caption file. eg: .caption, .txt',
+                 value='.txt',
+                 interactive=True,
+             )
+             overwrite = gr.Checkbox(
+                 label='Overwrite existing captions in folder',
+                 interactive=True,
+                 value=False,
+             )
+         with gr.Row():
+             prefix = gr.Textbox(
+                 label='Prefix to add to caption',
+                 placeholder='(Optional)',
+                 interactive=True,
+             )
+             caption_text = gr.Textbox(
+                 label='Caption text',
+                 placeholder='Eg: , by some artist. Leave empty if you just want to add pre or postfix',
+                 interactive=True,
+             )
+             postfix = gr.Textbox(
+                 label='Postfix to add to caption',
+                 placeholder='(Optional)',
+                 interactive=True,
+             )
+         with gr.Row():
+             find_text = gr.Textbox(
+                 label='Find text',
+                 placeholder='Eg: , by some artist. Leave empty if you just want to add pre or postfix',
+                 interactive=True,
+             )
+             replace_text = gr.Textbox(
+                 label='Replacement text',
+                 placeholder='Eg: , by some artist. Leave empty if you just want to replace with nothing',
+                 interactive=True,
+             )
+         caption_button = gr.Button('Caption images')
+         caption_button.click(
+             caption_images,
+             inputs=[
+                 caption_text,
+                 images_dir,
+                 overwrite,
+                 caption_ext,
+                 prefix,
+                 postfix,
+                 find_text,
+                 replace_text,
+             ],
+             show_progress=False,
+         )
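For reference (not part of the commit), a minimal sketch of the command line that `caption_images` above assembles; the folder path and caption text are made-up examples:

    caption_images(
        caption_text='a photo of sks person',
        images_dir='/data/train/img',   # hypothetical folder
        overwrite=True,
        caption_ext='.txt',
        prefix='',
        postfix='',
        find_text='',
        replace_text='',
    )
    # run_cmd built by the function:
    #   python "tools/caption.py" --caption_text="a photo of sks person" --overwrite --caption_file_ext=".txt" "/data/train/img"
    # With overwrite=True and empty prefix/postfix/find_text, no caption files are modified afterwards.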
library/blip_caption_gui.py ADDED
@@ -0,0 +1,149 @@
+ import gradio as gr
+ from easygui import msgbox
+ import subprocess
+ import os
+ from .common_gui import get_folder_path, add_pre_postfix
+
+ PYTHON = 'python3' if os.name == 'posix' else './venv/Scripts/python.exe'
+
+
+ def caption_images(
+     train_data_dir,
+     caption_file_ext,
+     batch_size,
+     num_beams,
+     top_p,
+     max_length,
+     min_length,
+     beam_search,
+     prefix,
+     postfix,
+ ):
+     # Check for caption_text_input
+     # if caption_text_input == "":
+     #     msgbox("Caption text is missing...")
+     #     return
+
+     # Check for images_dir_input
+     if train_data_dir == '':
+         msgbox('Image folder is missing...')
+         return
+
+     if caption_file_ext == '':
+         msgbox('Please provide an extension for the caption files.')
+         return
+
+     print(f'Captioning files in {train_data_dir}...')
+     run_cmd = f'{PYTHON} "finetune/make_captions.py"'
+     run_cmd += f' --batch_size="{int(batch_size)}"'
+     run_cmd += f' --num_beams="{int(num_beams)}"'
+     run_cmd += f' --top_p="{top_p}"'
+     run_cmd += f' --max_length="{int(max_length)}"'
+     run_cmd += f' --min_length="{int(min_length)}"'
+     if beam_search:
+         run_cmd += f' --beam_search'
+     if caption_file_ext != '':
+         run_cmd += f' --caption_extension="{caption_file_ext}"'
+     run_cmd += f' "{train_data_dir}"'
+     run_cmd += f' --caption_weights="https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth"'
+
+     print(run_cmd)
+
+     # Run the command
+     if os.name == 'posix':
+         os.system(run_cmd)
+     else:
+         subprocess.run(run_cmd)
+
+     # Add prefix and postfix
+     add_pre_postfix(
+         folder=train_data_dir,
+         caption_file_ext=caption_file_ext,
+         prefix=prefix,
+         postfix=postfix,
+     )
+
+     print('...captioning done')
+
+
+ ###
+ # Gradio UI
+ ###
+
+
+ def gradio_blip_caption_gui_tab():
+     with gr.Tab('BLIP Captioning'):
+         gr.Markdown(
+             'This utility will use BLIP to caption files for each images in a folder.'
+         )
+         with gr.Row():
+             train_data_dir = gr.Textbox(
+                 label='Image folder to caption',
+                 placeholder='Directory containing the images to caption',
+                 interactive=True,
+             )
+             button_train_data_dir_input = gr.Button(
+                 '📂', elem_id='open_folder_small'
+             )
+             button_train_data_dir_input.click(
+                 get_folder_path,
+                 outputs=train_data_dir,
+                 show_progress=False,
+             )
+         with gr.Row():
+             caption_file_ext = gr.Textbox(
+                 label='Caption file extension',
+                 placeholder='Extention for caption file. eg: .caption, .txt',
+                 value='.txt',
+                 interactive=True,
+             )
+
+             prefix = gr.Textbox(
+                 label='Prefix to add to BLIP caption',
+                 placeholder='(Optional)',
+                 interactive=True,
+             )
+
+             postfix = gr.Textbox(
+                 label='Postfix to add to BLIP caption',
+                 placeholder='(Optional)',
+                 interactive=True,
+             )
+
+             batch_size = gr.Number(
+                 value=1, label='Batch size', interactive=True
+             )
+
+         with gr.Row():
+             beam_search = gr.Checkbox(
+                 label='Use beam search', interactive=True, value=True
+             )
+             num_beams = gr.Number(
+                 value=1, label='Number of beams', interactive=True
+             )
+             top_p = gr.Number(value=0.9, label='Top p', interactive=True)
+             max_length = gr.Number(
+                 value=75, label='Max length', interactive=True
+             )
+             min_length = gr.Number(
+                 value=5, label='Min length', interactive=True
+             )
+
+         caption_button = gr.Button('Caption images')
+
+         caption_button.click(
+             caption_images,
+             inputs=[
+                 train_data_dir,
+                 caption_file_ext,
+                 batch_size,
+                 num_beams,
+                 top_p,
+                 max_length,
+                 min_length,
+                 beam_search,
+                 prefix,
+                 postfix,
+             ],
+             show_progress=False,
+         )
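Not part of the commit: a minimal sketch of how the two captioning tabs above might be mounted in a Gradio Blocks app, assuming the repository root is on PYTHONPATH and the Gradio 3.x environment this project targets:

    import gradio as gr

    from library.basic_caption_gui import gradio_basic_caption_gui_tab
    from library.blip_caption_gui import gradio_blip_caption_gui_tab

    with gr.Blocks() as demo:
        gradio_basic_caption_gui_tab()   # builds the 'Basic Captioning' tab
        gradio_blip_caption_gui_tab()    # builds the 'BLIP Captioning' tab

    demo.launch()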
library/common_gui.py ADDED
@@ -0,0 +1,978 @@
1
+ from tkinter import filedialog, Tk
2
+ from easygui import msgbox
3
+ import os
4
+ import gradio as gr
5
+ import easygui
6
+ import shutil
7
+
8
+ folder_symbol = '\U0001f4c2' # 📂
9
+ refresh_symbol = '\U0001f504' # 🔄
10
+ save_style_symbol = '\U0001f4be' # 💾
11
+ document_symbol = '\U0001F4C4' # 📄
12
+
13
+ # define a list of substrings to search for v2 base models
14
+ V2_BASE_MODELS = [
15
+ 'stabilityai/stable-diffusion-2-1-base',
16
+ 'stabilityai/stable-diffusion-2-base',
17
+ ]
18
+
19
+ # define a list of substrings to search for v_parameterization models
20
+ V_PARAMETERIZATION_MODELS = [
21
+ 'stabilityai/stable-diffusion-2-1',
22
+ 'stabilityai/stable-diffusion-2',
23
+ ]
24
+
25
+ # define a list of substrings to v1.x models
26
+ V1_MODELS = [
27
+ 'CompVis/stable-diffusion-v1-4',
28
+ 'runwayml/stable-diffusion-v1-5',
29
+ ]
30
+
31
+ # define a list of substrings to search for
32
+ ALL_PRESET_MODELS = V2_BASE_MODELS + V_PARAMETERIZATION_MODELS + V1_MODELS
33
+
34
+ FILE_ENV_EXCLUSION = ['COLAB_GPU', 'RUNPOD_POD_ID']
35
+
36
+
37
+ def check_if_model_exist(output_name, output_dir, save_model_as):
38
+ if save_model_as in ['diffusers', 'diffusers_safetendors']:
39
+ ckpt_folder = os.path.join(output_dir, output_name)
40
+ if os.path.isdir(ckpt_folder):
41
+ msg = f'A diffuser model with the same name {ckpt_folder} already exists. Do you want to overwrite it?'
42
+ if not easygui.ynbox(msg, 'Overwrite Existing Model?'):
43
+ print(
44
+ 'Aborting training due to existing model with same name...'
45
+ )
46
+ return True
47
+ elif save_model_as in ['ckpt', 'safetensors']:
48
+ ckpt_file = os.path.join(output_dir, output_name + '.' + save_model_as)
49
+ if os.path.isfile(ckpt_file):
50
+ msg = f'A model with the same file name {ckpt_file} already exists. Do you want to overwrite it?'
51
+ if not easygui.ynbox(msg, 'Overwrite Existing Model?'):
52
+ print(
53
+ 'Aborting training due to existing model with same name...'
54
+ )
55
+ return True
56
+ else:
57
+ print(
58
+ 'Can\'t verify if existing model exist when save model is set a "same as source model", continuing to train model...'
59
+ )
60
+ return False
61
+
62
+ return False
63
+
64
+
65
+ def update_my_data(my_data):
66
+ # Update the optimizer based on the use_8bit_adam flag
67
+ use_8bit_adam = my_data.get('use_8bit_adam', False)
68
+ my_data.setdefault('optimizer', 'AdamW8bit' if use_8bit_adam else 'AdamW')
69
+
70
+ # Update model_list to custom if empty or pretrained_model_name_or_path is not a preset model
71
+ model_list = my_data.get('model_list', [])
72
+ pretrained_model_name_or_path = my_data.get('pretrained_model_name_or_path', '')
73
+ if not model_list or pretrained_model_name_or_path not in ALL_PRESET_MODELS:
74
+ my_data['model_list'] = 'custom'
75
+
76
+ # Convert epoch and save_every_n_epochs values to int if they are strings
77
+ for key in ['epoch', 'save_every_n_epochs']:
78
+ value = my_data.get(key, -1)
79
+ if isinstance(value, str) and value.isdigit():
80
+ my_data[key] = int(value)
81
+ elif not value:
82
+ my_data[key] = -1
83
+
84
+ # Update LoRA_type if it is set to LoCon
85
+ if my_data.get('LoRA_type', 'Standard') == 'LoCon':
86
+ my_data['LoRA_type'] = 'LyCORIS/LoCon'
87
+
88
+ # Update model save choices due to changes for LoRA and TI training
89
+ if (
90
+ (my_data.get('LoRA_type') or my_data.get('num_vectors_per_token'))
91
+ and my_data.get('save_model_as') not in ['safetensors', 'ckpt']
92
+ ):
93
+ message = (
94
+ 'Updating save_model_as to safetensors because the current value in the config file is no longer applicable to {}'
95
+ )
96
+ if my_data.get('LoRA_type'):
97
+ print(message.format('LoRA'))
98
+ if my_data.get('num_vectors_per_token'):
99
+ print(message.format('TI'))
100
+ my_data['save_model_as'] = 'safetensors'
101
+
102
+ return my_data
103
+
104
+
105
+ def get_dir_and_file(file_path):
106
+ dir_path, file_name = os.path.split(file_path)
107
+ return (dir_path, file_name)
108
+
109
+
110
+ # def has_ext_files(directory, extension):
111
+ # # Iterate through all the files in the directory
112
+ # for file in os.listdir(directory):
113
+ # # If the file name ends with extension, return True
114
+ # if file.endswith(extension):
115
+ # return True
116
+ # # If no extension files were found, return False
117
+ # return False
118
+
119
+
120
+ def get_file_path(
121
+ file_path='', default_extension='.json', extension_name='Config files'
122
+ ):
123
+ if not any(var in os.environ for var in FILE_ENV_EXCLUSION):
124
+ current_file_path = file_path
125
+ # print(f'current file path: {current_file_path}')
126
+
127
+ initial_dir, initial_file = get_dir_and_file(file_path)
128
+
129
+ # Create a hidden Tkinter root window
130
+ root = Tk()
131
+ root.wm_attributes('-topmost', 1)
132
+ root.withdraw()
133
+
134
+ # Show the open file dialog and get the selected file path
135
+ file_path = filedialog.askopenfilename(
136
+ filetypes=(
137
+ (extension_name, f'*{default_extension}'),
138
+ ('All files', '*.*'),
139
+ ),
140
+ defaultextension=default_extension,
141
+ initialfile=initial_file,
142
+ initialdir=initial_dir,
143
+ )
144
+
145
+ # Destroy the hidden root window
146
+ root.destroy()
147
+
148
+ # If no file is selected, use the current file path
149
+ if not file_path:
150
+ file_path = current_file_path
151
+ current_file_path = file_path
152
+ # print(f'current file path: {current_file_path}')
153
+
154
+ return file_path
155
+
156
+
157
+ def get_any_file_path(file_path=''):
158
+ if not any(var in os.environ for var in FILE_ENV_EXCLUSION):
159
+ current_file_path = file_path
160
+ # print(f'current file path: {current_file_path}')
161
+
162
+ initial_dir, initial_file = get_dir_and_file(file_path)
163
+
164
+ root = Tk()
165
+ root.wm_attributes('-topmost', 1)
166
+ root.withdraw()
167
+ file_path = filedialog.askopenfilename(
168
+ initialdir=initial_dir,
169
+ initialfile=initial_file,
170
+ )
171
+ root.destroy()
172
+
173
+ if file_path == '':
174
+ file_path = current_file_path
175
+
176
+ return file_path
177
+
178
+
179
+ def remove_doublequote(file_path):
180
+ if file_path != None:
181
+ file_path = file_path.replace('"', '')
182
+
183
+ return file_path
184
+
185
+
186
+ # def set_legacy_8bitadam(optimizer, use_8bit_adam):
187
+ # if optimizer == 'AdamW8bit':
188
+ # # use_8bit_adam = True
189
+ # return gr.Dropdown.update(value=optimizer), gr.Checkbox.update(
190
+ # value=True, interactive=False, visible=True
191
+ # )
192
+ # else:
193
+ # # use_8bit_adam = False
194
+ # return gr.Dropdown.update(value=optimizer), gr.Checkbox.update(
195
+ # value=False, interactive=False, visible=True
196
+ # )
197
+
198
+
199
+ def get_folder_path(folder_path=''):
200
+ if not any(var in os.environ for var in FILE_ENV_EXCLUSION):
201
+ current_folder_path = folder_path
202
+
203
+ initial_dir, initial_file = get_dir_and_file(folder_path)
204
+
205
+ root = Tk()
206
+ root.wm_attributes('-topmost', 1)
207
+ root.withdraw()
208
+ folder_path = filedialog.askdirectory(initialdir=initial_dir)
209
+ root.destroy()
210
+
211
+ if folder_path == '':
212
+ folder_path = current_folder_path
213
+
214
+ return folder_path
215
+
216
+
217
+ def get_saveasfile_path(
218
+ file_path='', defaultextension='.json', extension_name='Config files'
219
+ ):
220
+ if not any(var in os.environ for var in FILE_ENV_EXCLUSION):
221
+ current_file_path = file_path
222
+ # print(f'current file path: {current_file_path}')
223
+
224
+ initial_dir, initial_file = get_dir_and_file(file_path)
225
+
226
+ root = Tk()
227
+ root.wm_attributes('-topmost', 1)
228
+ root.withdraw()
229
+ save_file_path = filedialog.asksaveasfile(
230
+ filetypes=(
231
+ (f'{extension_name}', f'{defaultextension}'),
232
+ ('All files', '*'),
233
+ ),
234
+ defaultextension=defaultextension,
235
+ initialdir=initial_dir,
236
+ initialfile=initial_file,
237
+ )
238
+ root.destroy()
239
+
240
+ # print(save_file_path)
241
+
242
+ if save_file_path == None:
243
+ file_path = current_file_path
244
+ else:
245
+ print(save_file_path.name)
246
+ file_path = save_file_path.name
247
+
248
+ # print(file_path)
249
+
250
+ return file_path
251
+
252
+
253
+ def get_saveasfilename_path(
254
+ file_path='', extensions='*', extension_name='Config files'
255
+ ):
256
+ if not any(var in os.environ for var in FILE_ENV_EXCLUSION):
257
+ current_file_path = file_path
258
+ # print(f'current file path: {current_file_path}')
259
+
260
+ initial_dir, initial_file = get_dir_and_file(file_path)
261
+
262
+ root = Tk()
263
+ root.wm_attributes('-topmost', 1)
264
+ root.withdraw()
265
+ save_file_path = filedialog.asksaveasfilename(
266
+ filetypes=((f'{extension_name}', f'{extensions}'), ('All files', '*')),
267
+ defaultextension=extensions,
268
+ initialdir=initial_dir,
269
+ initialfile=initial_file,
270
+ )
271
+ root.destroy()
272
+
273
+ if save_file_path == '':
274
+ file_path = current_file_path
275
+ else:
276
+ # print(save_file_path)
277
+ file_path = save_file_path
278
+
279
+ return file_path
280
+
281
+
282
+ def add_pre_postfix(
283
+ folder: str = '',
284
+ prefix: str = '',
285
+ postfix: str = '',
286
+ caption_file_ext: str = '.caption',
287
+ ) -> None:
288
+ """
289
+ Add prefix and/or postfix to the content of caption files within a folder.
290
+ If no caption files are found, create one with the requested prefix and/or postfix.
291
+
292
+ Args:
293
+ folder (str): Path to the folder containing caption files.
294
+ prefix (str, optional): Prefix to add to the content of the caption files.
295
+ postfix (str, optional): Postfix to add to the content of the caption files.
296
+ caption_file_ext (str, optional): Extension of the caption files.
297
+ """
298
+
299
+ if prefix == '' and postfix == '':
300
+ return
301
+
302
+ image_extensions = ('.jpg', '.jpeg', '.png', '.webp')
303
+ image_files = [
304
+ f for f in os.listdir(folder) if f.lower().endswith(image_extensions)
305
+ ]
306
+
307
+ for image_file in image_files:
308
+ caption_file_name = os.path.splitext(image_file)[0] + caption_file_ext
309
+ caption_file_path = os.path.join(folder, caption_file_name)
310
+
311
+ if not os.path.exists(caption_file_path):
312
+ with open(caption_file_path, 'w') as f:
313
+ separator = ' ' if prefix and postfix else ''
314
+ f.write(f'{prefix}{separator}{postfix}')
315
+ else:
316
+ with open(caption_file_path, 'r+') as f:
317
+ content = f.read()
318
+ content = content.rstrip()
319
+ f.seek(0, 0)
320
+
321
+ prefix_separator = ' ' if prefix else ''
322
+ postfix_separator = ' ' if postfix else ''
323
+ f.write(
324
+ f'{prefix}{prefix_separator}{content}{postfix_separator}{postfix}'
325
+ )
326
+
327
+
328
+ def has_ext_files(folder_path: str, file_extension: str) -> bool:
329
+ """
330
+ Check if there are any files with the specified extension in the given folder.
331
+
332
+ Args:
333
+ folder_path (str): Path to the folder containing files.
334
+ file_extension (str): Extension of the files to look for.
335
+
336
+ Returns:
337
+ bool: True if files with the specified extension are found, False otherwise.
338
+ """
339
+ for file in os.listdir(folder_path):
340
+ if file.endswith(file_extension):
341
+ return True
342
+ return False
343
+
344
+
345
+ def find_replace(
346
+ folder_path: str = '',
347
+ caption_file_ext: str = '.caption',
348
+ search_text: str = '',
349
+ replace_text: str = '',
350
+ ) -> None:
351
+ """
352
+ Find and replace text in caption files within a folder.
353
+
354
+ Args:
355
+ folder_path (str, optional): Path to the folder containing caption files.
356
+ caption_file_ext (str, optional): Extension of the caption files.
357
+ search_text (str, optional): Text to search for in the caption files.
358
+ replace_text (str, optional): Text to replace the search text with.
359
+ """
360
+ print('Running caption find/replace')
361
+
362
+ if not has_ext_files(folder_path, caption_file_ext):
363
+ msgbox(
364
+ f'No files with extension {caption_file_ext} were found in {folder_path}...'
365
+ )
366
+ return
367
+
368
+ if search_text == '':
369
+ return
370
+
371
+ caption_files = [
372
+ f for f in os.listdir(folder_path) if f.endswith(caption_file_ext)
373
+ ]
374
+
375
+ for caption_file in caption_files:
376
+ with open(
377
+ os.path.join(folder_path, caption_file), 'r', errors='ignore'
378
+ ) as f:
379
+ content = f.read()
380
+
381
+ content = content.replace(search_text, replace_text)
382
+
383
+ with open(os.path.join(folder_path, caption_file), 'w') as f:
384
+ f.write(content)
385
+
386
+
387
+ def color_aug_changed(color_aug):
388
+ if color_aug:
389
+ msgbox(
390
+ 'Disabling "Cache latent" because "Color augmentation" has been selected...'
391
+ )
392
+ return gr.Checkbox.update(value=False, interactive=False)
393
+ else:
394
+ return gr.Checkbox.update(value=True, interactive=True)
395
+
396
+
397
+ def save_inference_file(output_dir, v2, v_parameterization, output_name):
398
+ # List all files in the directory
399
+ files = os.listdir(output_dir)
400
+
401
+ # Iterate over the list of files
402
+ for file in files:
403
+ # Check if the file starts with the value of output_name
404
+ if file.startswith(output_name):
405
+ # Check if it is a file or a directory
406
+ if os.path.isfile(os.path.join(output_dir, file)):
407
+ # Split the file name and extension
408
+ file_name, ext = os.path.splitext(file)
409
+
410
+ # Copy the v2-inference-v.yaml file to the current file, with a .yaml extension
411
+ if v2 and v_parameterization:
412
+ print(
413
+ f'Saving v2-inference-v.yaml as {output_dir}/{file_name}.yaml'
414
+ )
415
+ shutil.copy(
416
+ f'./v2_inference/v2-inference-v.yaml',
417
+ f'{output_dir}/{file_name}.yaml',
418
+ )
419
+ elif v2:
420
+ print(
421
+ f'Saving v2-inference.yaml as {output_dir}/{file_name}.yaml'
422
+ )
423
+ shutil.copy(
424
+ f'./v2_inference/v2-inference.yaml',
425
+ f'{output_dir}/{file_name}.yaml',
426
+ )
427
+
428
+
429
+ def set_pretrained_model_name_or_path_input(
430
+ model_list, pretrained_model_name_or_path, v2, v_parameterization
431
+ ):
432
+ # check if $v2 and $v_parameterization are empty and if $pretrained_model_name_or_path contains any of the substrings in the v2 list
433
+ if str(model_list) in V2_BASE_MODELS:
434
+ print('SD v2 model detected. Setting --v2 parameter')
435
+ v2 = True
436
+ v_parameterization = False
437
+ pretrained_model_name_or_path = str(model_list)
438
+
439
+ # check if $v2 and $v_parameterization are empty and if $pretrained_model_name_or_path contains any of the substrings in the v_parameterization list
440
+ if str(model_list) in V_PARAMETERIZATION_MODELS:
441
+ print(
442
+ 'SD v2 v_parameterization detected. Setting --v2 parameter and --v_parameterization'
443
+ )
444
+ v2 = True
445
+ v_parameterization = True
446
+ pretrained_model_name_or_path = str(model_list)
447
+
448
+ if str(model_list) in V1_MODELS:
449
+ v2 = False
450
+ v_parameterization = False
451
+ pretrained_model_name_or_path = str(model_list)
452
+
453
+ if model_list == 'custom':
454
+ if (
455
+ str(pretrained_model_name_or_path) in V1_MODELS
456
+ or str(pretrained_model_name_or_path) in V2_BASE_MODELS
457
+ or str(pretrained_model_name_or_path) in V_PARAMETERIZATION_MODELS
458
+ ):
459
+ pretrained_model_name_or_path = ''
460
+ v2 = False
461
+ v_parameterization = False
462
+ return model_list, pretrained_model_name_or_path, v2, v_parameterization
463
+
464
+
465
+ def set_v2_checkbox(model_list, v2, v_parameterization):
466
+ # check if $v2 and $v_parameterization are empty and if $pretrained_model_name_or_path contains any of the substrings in the v2 list
467
+ if str(model_list) in V2_BASE_MODELS:
468
+ v2 = True
469
+ v_parameterization = False
470
+
471
+ # check if $v2 and $v_parameterization are empty and if $pretrained_model_name_or_path contains any of the substrings in the v_parameterization list
472
+ if str(model_list) in V_PARAMETERIZATION_MODELS:
473
+ v2 = True
474
+ v_parameterization = True
475
+
476
+ if str(model_list) in V1_MODELS:
477
+ v2 = False
478
+ v_parameterization = False
479
+
480
+ return v2, v_parameterization
481
+
482
+
483
+ def set_model_list(
484
+ model_list,
485
+ pretrained_model_name_or_path,
486
+ v2,
487
+ v_parameterization,
488
+ ):
489
+
490
+ if not pretrained_model_name_or_path in ALL_PRESET_MODELS:
491
+ model_list = 'custom'
492
+ else:
493
+ model_list = pretrained_model_name_or_path
494
+
495
+ return model_list, v2, v_parameterization
496
+
497
+
498
+ ###
499
+ ### Gradio common GUI section
500
+ ###
501
+
502
+
503
+ def gradio_config():
504
+ with gr.Accordion('Configuration file', open=False):
505
+ with gr.Row():
506
+ button_open_config = gr.Button('Open 📂', elem_id='open_folder')
507
+ button_save_config = gr.Button('Save 💾', elem_id='open_folder')
508
+ button_save_as_config = gr.Button(
509
+ 'Save as... 💾', elem_id='open_folder'
510
+ )
511
+ config_file_name = gr.Textbox(
512
+ label='',
513
+ placeholder="type the configuration file path or use the 'Open' button above to select it...",
514
+ interactive=True,
515
+ )
516
+ button_load_config = gr.Button('Load 💾', elem_id='open_folder')
517
+ config_file_name.change(
518
+ remove_doublequote,
519
+ inputs=[config_file_name],
520
+ outputs=[config_file_name],
521
+ )
522
+ return (
523
+ button_open_config,
524
+ button_save_config,
525
+ button_save_as_config,
526
+ config_file_name,
527
+ button_load_config,
528
+ )
529
+
530
+
531
+ def get_pretrained_model_name_or_path_file(
532
+ model_list, pretrained_model_name_or_path
533
+ ):
534
+ pretrained_model_name_or_path = get_any_file_path(
535
+ pretrained_model_name_or_path
536
+ )
537
+ set_model_list(model_list, pretrained_model_name_or_path)
538
+
539
+
540
+ def gradio_source_model(save_model_as_choices = [
541
+ 'same as source model',
542
+ 'ckpt',
543
+ 'diffusers',
544
+ 'diffusers_safetensors',
545
+ 'safetensors',
546
+ ]):
547
+ with gr.Tab('Source model'):
548
+ # Define the input elements
549
+ with gr.Row():
550
+ pretrained_model_name_or_path = gr.Textbox(
551
+ label='Pretrained model name or path',
552
+ placeholder='enter the path to custom model or name of pretrained model',
553
+ value='runwayml/stable-diffusion-v1-5',
554
+ )
555
+ pretrained_model_name_or_path_file = gr.Button(
556
+ document_symbol, elem_id='open_folder_small'
557
+ )
558
+ pretrained_model_name_or_path_file.click(
559
+ get_any_file_path,
560
+ inputs=pretrained_model_name_or_path,
561
+ outputs=pretrained_model_name_or_path,
562
+ show_progress=False,
563
+ )
564
+ pretrained_model_name_or_path_folder = gr.Button(
565
+ folder_symbol, elem_id='open_folder_small'
566
+ )
567
+ pretrained_model_name_or_path_folder.click(
568
+ get_folder_path,
569
+ inputs=pretrained_model_name_or_path,
570
+ outputs=pretrained_model_name_or_path,
571
+ show_progress=False,
572
+ )
573
+ model_list = gr.Dropdown(
574
+ label='Model Quick Pick',
575
+ choices=[
576
+ 'custom',
577
+ 'stabilityai/stable-diffusion-2-1-base',
578
+ 'stabilityai/stable-diffusion-2-base',
579
+ 'stabilityai/stable-diffusion-2-1',
580
+ 'stabilityai/stable-diffusion-2',
581
+ 'runwayml/stable-diffusion-v1-5',
582
+ 'CompVis/stable-diffusion-v1-4',
583
+ ],
584
+ value='runwayml/stable-diffusion-v1-5',
585
+ )
586
+ save_model_as = gr.Dropdown(
587
+ label='Save trained model as',
588
+ choices=save_model_as_choices,
589
+ value='safetensors',
590
+ )
591
+
592
+ with gr.Row():
593
+ v2 = gr.Checkbox(label='v2', value=False)
594
+ v_parameterization = gr.Checkbox(
595
+ label='v_parameterization', value=False
596
+ )
597
+ v2.change(
598
+ set_v2_checkbox,
599
+ inputs=[model_list, v2, v_parameterization],
600
+ outputs=[v2, v_parameterization],
601
+ show_progress=False,
602
+ )
603
+ v_parameterization.change(
604
+ set_v2_checkbox,
605
+ inputs=[model_list, v2, v_parameterization],
606
+ outputs=[v2, v_parameterization],
607
+ show_progress=False,
608
+ )
609
+ model_list.change(
610
+ set_pretrained_model_name_or_path_input,
611
+ inputs=[
612
+ model_list,
613
+ pretrained_model_name_or_path,
614
+ v2,
615
+ v_parameterization,
616
+ ],
617
+ outputs=[
618
+ model_list,
619
+ pretrained_model_name_or_path,
620
+ v2,
621
+ v_parameterization,
622
+ ],
623
+ show_progress=False,
624
+ )
625
+ # Update the model list and parameters when user click outside the button or field
626
+ pretrained_model_name_or_path.change(
627
+ set_model_list,
628
+ inputs=[
629
+ model_list,
630
+ pretrained_model_name_or_path,
631
+ v2,
632
+ v_parameterization,
633
+ ],
634
+ outputs=[
635
+ model_list,
636
+ v2,
637
+ v_parameterization,
638
+ ],
639
+ show_progress=False,
640
+ )
641
+ return (
642
+ pretrained_model_name_or_path,
643
+ v2,
644
+ v_parameterization,
645
+ save_model_as,
646
+ model_list,
647
+ )
648
+
649
+
650
+ def gradio_training(
651
+ learning_rate_value='1e-6',
652
+ lr_scheduler_value='constant',
653
+ lr_warmup_value='0',
654
+ ):
655
+ with gr.Row():
656
+ train_batch_size = gr.Slider(
657
+ minimum=1,
658
+ maximum=64,
659
+ label='Train batch size',
660
+ value=1,
661
+ step=1,
662
+ )
663
+ epoch = gr.Number(label='Epoch', value=1, precision=0)
664
+ save_every_n_epochs = gr.Number(
665
+ label='Save every N epochs', value=1, precision=0
666
+ )
667
+ caption_extension = gr.Textbox(
668
+ label='Caption Extension',
669
+ placeholder='(Optional) Extension for caption files. default: .caption',
670
+ )
671
+ with gr.Row():
672
+ mixed_precision = gr.Dropdown(
673
+ label='Mixed precision',
674
+ choices=[
675
+ 'no',
676
+ 'fp16',
677
+ 'bf16',
678
+ ],
679
+ value='fp16',
680
+ )
681
+ save_precision = gr.Dropdown(
682
+ label='Save precision',
683
+ choices=[
684
+ 'float',
685
+ 'fp16',
686
+ 'bf16',
687
+ ],
688
+ value='fp16',
689
+ )
690
+ num_cpu_threads_per_process = gr.Slider(
691
+ minimum=1,
692
+ maximum=os.cpu_count(),
693
+ step=1,
694
+ label='Number of CPU threads per core',
695
+ value=2,
696
+ )
697
+ seed = gr.Textbox(label='Seed', placeholder='(Optional) eg:1234')
698
+ cache_latents = gr.Checkbox(label='Cache latent', value=True)
699
+ with gr.Row():
700
+ learning_rate = gr.Textbox(
701
+ label='Learning rate', value=learning_rate_value
702
+ )
703
+ lr_scheduler = gr.Dropdown(
704
+ label='LR Scheduler',
705
+ choices=[
706
+ 'adafactor',
707
+ 'constant',
708
+ 'constant_with_warmup',
709
+ 'cosine',
710
+ 'cosine_with_restarts',
711
+ 'linear',
712
+ 'polynomial',
713
+ ],
714
+ value=lr_scheduler_value,
715
+ )
716
+ lr_warmup = gr.Textbox(
717
+ label='LR warmup (% of steps)', value=lr_warmup_value
718
+ )
719
+ optimizer = gr.Dropdown(
720
+ label='Optimizer',
721
+ choices=[
722
+ 'AdamW',
723
+ 'AdamW8bit',
724
+ 'Adafactor',
725
+ 'DAdaptation',
726
+ 'Lion',
727
+ 'SGDNesterov',
728
+ 'SGDNesterov8bit',
729
+ ],
730
+ value='AdamW8bit',
731
+ interactive=True,
732
+ )
733
+ with gr.Row():
734
+ optimizer_args = gr.Textbox(
735
+ label='Optimizer extra arguments',
736
+ placeholder='(Optional) eg: relative_step=True scale_parameter=True warmup_init=True',
737
+ )
738
+ return (
739
+ learning_rate,
740
+ lr_scheduler,
741
+ lr_warmup,
742
+ train_batch_size,
743
+ epoch,
744
+ save_every_n_epochs,
745
+ mixed_precision,
746
+ save_precision,
747
+ num_cpu_threads_per_process,
748
+ seed,
749
+ caption_extension,
750
+ cache_latents,
751
+ optimizer,
752
+ optimizer_args,
753
+ )
754
+
755
+
756
+ def run_cmd_training(**kwargs):
757
+ options = [
758
+ f' --learning_rate="{kwargs.get("learning_rate", "")}"'
759
+ if kwargs.get('learning_rate')
760
+ else '',
761
+ f' --lr_scheduler="{kwargs.get("lr_scheduler", "")}"'
762
+ if kwargs.get('lr_scheduler')
763
+ else '',
764
+ f' --lr_warmup_steps="{kwargs.get("lr_warmup_steps", "")}"'
765
+ if kwargs.get('lr_warmup_steps')
766
+ else '',
767
+ f' --train_batch_size="{kwargs.get("train_batch_size", "")}"'
768
+ if kwargs.get('train_batch_size')
769
+ else '',
770
+ f' --max_train_steps="{kwargs.get("max_train_steps", "")}"'
771
+ if kwargs.get('max_train_steps')
772
+ else '',
773
+ f' --save_every_n_epochs="{int(kwargs.get("save_every_n_epochs", 1))}"'
774
+ if int(kwargs.get('save_every_n_epochs'))
775
+ else '',
776
+ f' --mixed_precision="{kwargs.get("mixed_precision", "")}"'
777
+ if kwargs.get('mixed_precision')
778
+ else '',
779
+ f' --save_precision="{kwargs.get("save_precision", "")}"'
780
+ if kwargs.get('save_precision')
781
+ else '',
782
+ f' --seed="{kwargs.get("seed", "")}"'
783
+ if kwargs.get('seed') != ''
784
+ else '',
785
+ f' --caption_extension="{kwargs.get("caption_extension", "")}"'
786
+ if kwargs.get('caption_extension')
787
+ else '',
788
+ ' --cache_latents' if kwargs.get('cache_latents') else '',
789
+ # ' --use_lion_optimizer' if kwargs.get('optimizer') == 'Lion' else '',
790
+ f' --optimizer_type="{kwargs.get("optimizer", "AdamW")}"',
791
+ f' --optimizer_args {kwargs.get("optimizer_args", "")}'
792
+ if not kwargs.get('optimizer_args') == ''
793
+ else '',
794
+ ]
795
+ run_cmd = ''.join(options)
796
+ return run_cmd
797
+
798
+
799
+ def gradio_advanced_training():
800
+ with gr.Row():
801
+ additional_parameters = gr.Textbox(
802
+ label='Additional parameters',
803
+ placeholder='(Optional) Use to provide additional parameters not handled by the GUI. Eg: --some_parameters "value"',
804
+ )
805
+ with gr.Row():
806
+ keep_tokens = gr.Slider(
807
+ label='Keep n tokens', value='0', minimum=0, maximum=32, step=1
808
+ )
809
+ clip_skip = gr.Slider(
810
+ label='Clip skip', value='1', minimum=1, maximum=12, step=1
811
+ )
812
+ max_token_length = gr.Dropdown(
813
+ label='Max Token Length',
814
+ choices=[
815
+ '75',
816
+ '150',
817
+ '225',
818
+ ],
819
+ value='75',
820
+ )
821
+ full_fp16 = gr.Checkbox(
822
+ label='Full fp16 training (experimental)', value=False
823
+ )
824
+ with gr.Row():
825
+ gradient_checkpointing = gr.Checkbox(
826
+ label='Gradient checkpointing', value=False
827
+ )
828
+ shuffle_caption = gr.Checkbox(label='Shuffle caption', value=False)
829
+ persistent_data_loader_workers = gr.Checkbox(
830
+ label='Persistent data loader', value=False
831
+ )
832
+ mem_eff_attn = gr.Checkbox(
833
+ label='Memory efficient attention', value=False
834
+ )
835
+ with gr.Row():
836
+ # This use_8bit_adam element should be removed in a future release as it is no longer used
837
+ # use_8bit_adam = gr.Checkbox(
838
+ # label='Use 8bit adam', value=False, visible=False
839
+ # )
840
+ xformers = gr.Checkbox(label='Use xformers', value=True)
841
+ color_aug = gr.Checkbox(label='Color augmentation', value=False)
842
+ flip_aug = gr.Checkbox(label='Flip augmentation', value=False)
843
+ min_snr_gamma = gr.Slider(label='Min SNR gamma', value = 0, minimum=0, maximum=20, step=1)
844
+ with gr.Row():
845
+ bucket_no_upscale = gr.Checkbox(
846
+ label="Don't upscale bucket resolution", value=True
847
+ )
848
+ bucket_reso_steps = gr.Number(
849
+ label='Bucket resolution steps', value=64
850
+ )
851
+ random_crop = gr.Checkbox(
852
+ label='Random crop instead of center crop', value=False
853
+ )
854
+ noise_offset = gr.Textbox(
855
+ label='Noise offset (0 - 1)', placeholder='(Oprional) eg: 0.1'
856
+ )
857
+
858
+ with gr.Row():
859
+ caption_dropout_every_n_epochs = gr.Number(
860
+ label='Dropout caption every n epochs', value=0
861
+ )
862
+ caption_dropout_rate = gr.Slider(
863
+ label='Rate of caption dropout', value=0, minimum=0, maximum=1
864
+ )
865
+ vae_batch_size = gr.Slider(
866
+ label='VAE batch size',
867
+ minimum=0,
868
+ maximum=32,
869
+ value=0,
870
+ step=1
871
+ )
872
+ with gr.Row():
873
+ save_state = gr.Checkbox(label='Save training state', value=False)
874
+ resume = gr.Textbox(
875
+ label='Resume from saved training state',
876
+ placeholder='path to "last-state" state folder to resume from',
877
+ )
878
+ resume_button = gr.Button('📂', elem_id='open_folder_small')
879
+ resume_button.click(
880
+ get_folder_path,
881
+ outputs=resume,
882
+ show_progress=False,
883
+ )
884
+ max_train_epochs = gr.Textbox(
885
+ label='Max train epoch',
886
+ placeholder='(Optional) Override number of epoch',
887
+ )
888
+ max_data_loader_n_workers = gr.Textbox(
889
+ label='Max num workers for DataLoader',
890
+ placeholder='(Optional) Override number of epoch. Default: 8',
891
+ value="0",
892
+ )
893
+ return (
894
+ # use_8bit_adam,
895
+ xformers,
896
+ full_fp16,
897
+ gradient_checkpointing,
898
+ shuffle_caption,
899
+ color_aug,
900
+ flip_aug,
901
+ clip_skip,
902
+ mem_eff_attn,
903
+ save_state,
904
+ resume,
905
+ max_token_length,
906
+ max_train_epochs,
907
+ max_data_loader_n_workers,
908
+ keep_tokens,
909
+ persistent_data_loader_workers,
910
+ bucket_no_upscale,
911
+ random_crop,
912
+ bucket_reso_steps,
913
+ caption_dropout_every_n_epochs,
914
+ caption_dropout_rate,
915
+ noise_offset,
916
+ additional_parameters,
917
+ vae_batch_size,
918
+ min_snr_gamma,
919
+ )
920
+
921
+
922
+ def run_cmd_advanced_training(**kwargs):
923
+ options = [
924
+ f' --max_train_epochs="{kwargs.get("max_train_epochs", "")}"'
925
+ if kwargs.get('max_train_epochs')
926
+ else '',
927
+ f' --max_data_loader_n_workers="{kwargs.get("max_data_loader_n_workers", "")}"'
928
+ if kwargs.get('max_data_loader_n_workers')
929
+ else '',
930
+ f' --max_token_length={kwargs.get("max_token_length", "")}'
931
+ if int(kwargs.get('max_token_length', 75)) > 75
932
+ else '',
933
+ f' --clip_skip={kwargs.get("clip_skip", "")}'
934
+ if int(kwargs.get('clip_skip', 1)) > 1
935
+ else '',
936
+ f' --resume="{kwargs.get("resume", "")}"'
937
+ if kwargs.get('resume')
938
+ else '',
939
+ f' --keep_tokens="{kwargs.get("keep_tokens", "")}"'
940
+ if int(kwargs.get('keep_tokens', 0)) > 0
941
+ else '',
942
+ f' --caption_dropout_every_n_epochs="{int(kwargs.get("caption_dropout_every_n_epochs", 0))}"'
943
+ if int(kwargs.get('caption_dropout_every_n_epochs', 0)) > 0
944
+ else '',
945
+ f' --caption_dropout_every_n_epochs="{int(kwargs.get("caption_dropout_every_n_epochs", 0))}"'
946
+ if int(kwargs.get('caption_dropout_every_n_epochs', 0)) > 0
947
+ else '',
948
+ f' --vae_batch_size="{kwargs.get("vae_batch_size", 0)}"'
949
+ if int(kwargs.get('vae_batch_size', 0)) > 0
950
+ else '',
951
+ f' --bucket_reso_steps={int(kwargs.get("bucket_reso_steps", 1))}'
952
+ if int(kwargs.get('bucket_reso_steps', 64)) >= 1
953
+ else '',
954
+ f' --min_snr_gamma={int(kwargs.get("min_snr_gamma", 0))}'
955
+ if int(kwargs.get('min_snr_gamma', 0)) >= 1
956
+ else '',
957
+ ' --save_state' if kwargs.get('save_state') else '',
958
+ ' --mem_eff_attn' if kwargs.get('mem_eff_attn') else '',
959
+ ' --color_aug' if kwargs.get('color_aug') else '',
960
+ ' --flip_aug' if kwargs.get('flip_aug') else '',
961
+ ' --shuffle_caption' if kwargs.get('shuffle_caption') else '',
962
+ ' --gradient_checkpointing' if kwargs.get('gradient_checkpointing')
963
+ else '',
964
+ ' --full_fp16' if kwargs.get('full_fp16') else '',
965
+ ' --xformers' if kwargs.get('xformers') else '',
966
+ # ' --use_8bit_adam' if kwargs.get('use_8bit_adam') else '',
967
+ ' --persistent_data_loader_workers'
968
+ if kwargs.get('persistent_data_loader_workers')
969
+ else '',
970
+ ' --bucket_no_upscale' if kwargs.get('bucket_no_upscale') else '',
971
+ ' --random_crop' if kwargs.get('random_crop') else '',
972
+ f' --noise_offset={float(kwargs.get("noise_offset", 0))}'
973
+ if not kwargs.get('noise_offset', '') == ''
974
+ else '',
975
+ f' {kwargs.get("additional_parameters", "")}',
976
+ ]
977
+ run_cmd = ''.join(options)
978
+ return run_cmd
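Not part of the commit: an illustrative call to `run_cmd_training` from common_gui.py above, assuming the repository root is on PYTHONPATH and the project's dependencies (gradio, easygui, tkinter) are installed; the values are made-up examples:

    from library.common_gui import run_cmd_training

    # Builds the argument fragment that the GUI appends to the training command.
    args = run_cmd_training(
        learning_rate='1e-5',
        lr_scheduler='cosine',
        train_batch_size=2,
        save_every_n_epochs=1,
        mixed_precision='fp16',
        save_precision='fp16',
        seed='1234',
        cache_latents=True,
        optimizer='AdamW8bit',
        optimizer_args='',
    )
    # args == (' --learning_rate="1e-5" --lr_scheduler="cosine" --train_batch_size="2"'
    #          ' --save_every_n_epochs="1" --mixed_precision="fp16" --save_precision="fp16"'
    #          ' --seed="1234" --cache_latents --optimizer_type="AdamW8bit"')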
library/config_util.py ADDED
@@ -0,0 +1,536 @@
1
+ import argparse
2
+ from dataclasses import (
3
+ asdict,
4
+ dataclass,
5
+ )
6
+ import functools
7
+ import random
8
+ from textwrap import dedent, indent
9
+ import json
10
+ from pathlib import Path
11
+ # from toolz import curry
12
+ from typing import (
13
+ List,
14
+ Optional,
15
+ Sequence,
16
+ Tuple,
17
+ Union,
18
+ )
19
+
20
+ import toml
21
+ import voluptuous
22
+ from voluptuous import (
23
+ Any,
24
+ ExactSequence,
25
+ MultipleInvalid,
26
+ Object,
27
+ Required,
28
+ Schema,
29
+ )
30
+ from transformers import CLIPTokenizer
31
+
32
+ from . import train_util
33
+ from .train_util import (
34
+ DreamBoothSubset,
35
+ FineTuningSubset,
36
+ DreamBoothDataset,
37
+ FineTuningDataset,
38
+ DatasetGroup,
39
+ )
40
+
41
+
42
+ def add_config_arguments(parser: argparse.ArgumentParser):
43
+ parser.add_argument("--dataset_config", type=Path, default=None, help="config file for detail settings / 詳細な設定用の設定ファイル")
44
+
45
+ # TODO: inherit Params class in Subset, Dataset
46
+
47
+ @dataclass
48
+ class BaseSubsetParams:
49
+ image_dir: Optional[str] = None
50
+ num_repeats: int = 1
51
+ shuffle_caption: bool = False
52
+ keep_tokens: int = 0
53
+ color_aug: bool = False
54
+ flip_aug: bool = False
55
+ face_crop_aug_range: Optional[Tuple[float, float]] = None
56
+ random_crop: bool = False
57
+ caption_dropout_rate: float = 0.0
58
+ caption_dropout_every_n_epochs: int = 0
59
+ caption_tag_dropout_rate: float = 0.0
60
+ token_warmup_min: int = 1
61
+ token_warmup_step: float = 0
62
+
63
+ @dataclass
64
+ class DreamBoothSubsetParams(BaseSubsetParams):
65
+ is_reg: bool = False
66
+ class_tokens: Optional[str] = None
67
+ caption_extension: str = ".caption"
68
+
69
+ @dataclass
70
+ class FineTuningSubsetParams(BaseSubsetParams):
71
+ metadata_file: Optional[str] = None
72
+
73
+ @dataclass
74
+ class BaseDatasetParams:
75
+ tokenizer: CLIPTokenizer = None
76
+ max_token_length: int = None
77
+ resolution: Optional[Tuple[int, int]] = None
78
+ debug_dataset: bool = False
79
+
80
+ @dataclass
81
+ class DreamBoothDatasetParams(BaseDatasetParams):
82
+ batch_size: int = 1
83
+ enable_bucket: bool = False
84
+ min_bucket_reso: int = 256
85
+ max_bucket_reso: int = 1024
86
+ bucket_reso_steps: int = 64
87
+ bucket_no_upscale: bool = False
88
+ prior_loss_weight: float = 1.0
89
+
90
+ @dataclass
91
+ class FineTuningDatasetParams(BaseDatasetParams):
92
+ batch_size: int = 1
93
+ enable_bucket: bool = False
94
+ min_bucket_reso: int = 256
95
+ max_bucket_reso: int = 1024
96
+ bucket_reso_steps: int = 64
97
+ bucket_no_upscale: bool = False
98
+
99
+ @dataclass
100
+ class SubsetBlueprint:
101
+ params: Union[DreamBoothSubsetParams, FineTuningSubsetParams]
102
+
103
+ @dataclass
104
+ class DatasetBlueprint:
105
+ is_dreambooth: bool
106
+ params: Union[DreamBoothDatasetParams, FineTuningDatasetParams]
107
+ subsets: Sequence[SubsetBlueprint]
108
+
109
+ @dataclass
110
+ class DatasetGroupBlueprint:
111
+ datasets: Sequence[DatasetBlueprint]
112
+ @dataclass
113
+ class Blueprint:
114
+ dataset_group: DatasetGroupBlueprint
115
+
116
+
117
+ class ConfigSanitizer:
118
+ # @curry
119
+ @staticmethod
120
+ def __validate_and_convert_twodim(klass, value: Sequence) -> Tuple:
121
+ Schema(ExactSequence([klass, klass]))(value)
122
+ return tuple(value)
123
+
124
+ # @curry
125
+ @staticmethod
126
+ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence]) -> Tuple:
127
+ Schema(Any(klass, ExactSequence([klass, klass])))(value)
128
+ try:
129
+ Schema(klass)(value)
130
+ return (value, value)
131
+ except:
132
+ return ConfigSanitizer.__validate_and_convert_twodim(klass, value)
133
+
134
+ # subset schema
135
+ SUBSET_ASCENDABLE_SCHEMA = {
136
+ "color_aug": bool,
137
+ "face_crop_aug_range": functools.partial(__validate_and_convert_twodim.__func__, float),
138
+ "flip_aug": bool,
139
+ "num_repeats": int,
140
+ "random_crop": bool,
141
+ "shuffle_caption": bool,
142
+ "keep_tokens": int,
143
+ "token_warmup_min": int,
144
+ "token_warmup_step": Any(float,int),
145
+ }
146
+ # DO means DropOut
147
+ DO_SUBSET_ASCENDABLE_SCHEMA = {
148
+ "caption_dropout_every_n_epochs": int,
149
+ "caption_dropout_rate": Any(float, int),
150
+ "caption_tag_dropout_rate": Any(float, int),
151
+ }
152
+ # DB means DreamBooth
153
+ DB_SUBSET_ASCENDABLE_SCHEMA = {
154
+ "caption_extension": str,
155
+ "class_tokens": str,
156
+ }
157
+ DB_SUBSET_DISTINCT_SCHEMA = {
158
+ Required("image_dir"): str,
159
+ "is_reg": bool,
160
+ }
161
+ # FT means FineTuning
162
+ FT_SUBSET_DISTINCT_SCHEMA = {
163
+ Required("metadata_file"): str,
164
+ "image_dir": str,
165
+ }
166
+
167
+ # datasets schema
168
+ DATASET_ASCENDABLE_SCHEMA = {
169
+ "batch_size": int,
170
+ "bucket_no_upscale": bool,
171
+ "bucket_reso_steps": int,
172
+ "enable_bucket": bool,
173
+ "max_bucket_reso": int,
174
+ "min_bucket_reso": int,
175
+ "resolution": functools.partial(__validate_and_convert_scalar_or_twodim.__func__, int),
176
+ }
177
+
178
+ # options handled by argparse but not handled by user config
179
+ ARGPARSE_SPECIFIC_SCHEMA = {
180
+ "debug_dataset": bool,
181
+ "max_token_length": Any(None, int),
182
+ "prior_loss_weight": Any(float, int),
183
+ }
184
+ # for handling default None value of argparse
185
+ ARGPARSE_NULLABLE_OPTNAMES = [
186
+ "face_crop_aug_range",
187
+ "resolution",
188
+ ]
189
+ # prepare map because option name may differ among argparse and user config
190
+ ARGPARSE_OPTNAME_TO_CONFIG_OPTNAME = {
191
+ "train_batch_size": "batch_size",
192
+ "dataset_repeats": "num_repeats",
193
+ }
194
+
195
+ def __init__(self, support_dreambooth: bool, support_finetuning: bool, support_dropout: bool) -> None:
196
+ assert support_dreambooth or support_finetuning, "Neither DreamBooth mode nor fine tuning mode specified. Please specify one mode or more. / DreamBooth モードか fine tuning モードのどちらも指定されていません。1つ以上指定してください。"
197
+
198
+ self.db_subset_schema = self.__merge_dict(
199
+ self.SUBSET_ASCENDABLE_SCHEMA,
200
+ self.DB_SUBSET_DISTINCT_SCHEMA,
201
+ self.DB_SUBSET_ASCENDABLE_SCHEMA,
202
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
203
+ )
204
+
205
+ self.ft_subset_schema = self.__merge_dict(
206
+ self.SUBSET_ASCENDABLE_SCHEMA,
207
+ self.FT_SUBSET_DISTINCT_SCHEMA,
208
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
209
+ )
210
+
211
+ self.db_dataset_schema = self.__merge_dict(
212
+ self.DATASET_ASCENDABLE_SCHEMA,
213
+ self.SUBSET_ASCENDABLE_SCHEMA,
214
+ self.DB_SUBSET_ASCENDABLE_SCHEMA,
215
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
216
+ {"subsets": [self.db_subset_schema]},
217
+ )
218
+
219
+ self.ft_dataset_schema = self.__merge_dict(
220
+ self.DATASET_ASCENDABLE_SCHEMA,
221
+ self.SUBSET_ASCENDABLE_SCHEMA,
222
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
223
+ {"subsets": [self.ft_subset_schema]},
224
+ )
225
+
226
+ if support_dreambooth and support_finetuning:
227
+ def validate_flex_dataset(dataset_config: dict):
228
+ subsets_config = dataset_config.get("subsets", [])
229
+
230
+ # check dataset meets FT style
231
+ # NOTE: all FT subsets should have "metadata_file"
232
+ if all(["metadata_file" in subset for subset in subsets_config]):
233
+ return Schema(self.ft_dataset_schema)(dataset_config)
234
+ # check dataset meets DB style
235
+ # NOTE: all DB subsets should have no "metadata_file"
236
+ elif all(["metadata_file" not in subset for subset in subsets_config]):
237
+ return Schema(self.db_dataset_schema)(dataset_config)
238
+ else:
239
+ raise voluptuous.Invalid("DreamBooth subset and fine tuning subset cannot be mixed in the same dataset. Please split them into separate datasets. / DreamBoothのサブセットとfine tuninのサブセットを同一のデータセットに混在させることはできません。別々のデータセットに分割してください。")
240
+
241
+ self.dataset_schema = validate_flex_dataset
242
+ elif support_dreambooth:
243
+ self.dataset_schema = self.db_dataset_schema
244
+ else:
245
+ self.dataset_schema = self.ft_dataset_schema
246
+
247
+ self.general_schema = self.__merge_dict(
248
+ self.DATASET_ASCENDABLE_SCHEMA,
249
+ self.SUBSET_ASCENDABLE_SCHEMA,
250
+ self.DB_SUBSET_ASCENDABLE_SCHEMA if support_dreambooth else {},
251
+ self.DO_SUBSET_ASCENDABLE_SCHEMA if support_dropout else {},
252
+ )
253
+
254
+ self.user_config_validator = Schema({
255
+ "general": self.general_schema,
256
+ "datasets": [self.dataset_schema],
257
+ })
258
+
259
+ self.argparse_schema = self.__merge_dict(
260
+ self.general_schema,
261
+ self.ARGPARSE_SPECIFIC_SCHEMA,
262
+ {optname: Any(None, self.general_schema[optname]) for optname in self.ARGPARSE_NULLABLE_OPTNAMES},
263
+ {a_name: self.general_schema[c_name] for a_name, c_name in self.ARGPARSE_OPTNAME_TO_CONFIG_OPTNAME.items()},
264
+ )
265
+
266
+ self.argparse_config_validator = Schema(Object(self.argparse_schema), extra=voluptuous.ALLOW_EXTRA)
267
+
268
+ def sanitize_user_config(self, user_config: dict) -> dict:
269
+ try:
270
+ return self.user_config_validator(user_config)
271
+ except MultipleInvalid:
272
+ # TODO: エラー発生時のメッセージをわかりやすくする
273
+ print("Invalid user config / ユーザ設定の形式が正しくないようです")
274
+ raise
275
+
276
+ # NOTE: In nature, argument parser result is not needed to be sanitize
277
+ # However this will help us to detect program bug
278
+ def sanitize_argparse_namespace(self, argparse_namespace: argparse.Namespace) -> argparse.Namespace:
279
+ try:
280
+ return self.argparse_config_validator(argparse_namespace)
281
+ except MultipleInvalid:
282
+ # XXX: this should be a bug
283
+ print("Invalid cmdline parsed arguments. This should be a bug. / コマンドラインのパース結果が正しくないようです。プログラムのバグの可能性が高いです。")
284
+ raise
285
+
286
+ # NOTE: value would be overwritten by latter dict if there is already the same key
287
+ @staticmethod
288
+ def __merge_dict(*dict_list: dict) -> dict:
289
+ merged = {}
290
+ for schema in dict_list:
291
+ # merged |= schema
292
+ for k, v in schema.items():
293
+ merged[k] = v
294
+ return merged
295
+
296
+
297
+ class BlueprintGenerator:
298
+ BLUEPRINT_PARAM_NAME_TO_CONFIG_OPTNAME = {
299
+ }
300
+
301
+ def __init__(self, sanitizer: ConfigSanitizer):
302
+ self.sanitizer = sanitizer
303
+
304
+ # runtime_params is for parameters which is only configurable on runtime, such as tokenizer
305
+ def generate(self, user_config: dict, argparse_namespace: argparse.Namespace, **runtime_params) -> Blueprint:
306
+ sanitized_user_config = self.sanitizer.sanitize_user_config(user_config)
307
+ sanitized_argparse_namespace = self.sanitizer.sanitize_argparse_namespace(argparse_namespace)
308
+
309
+ # convert argparse namespace to dict like config
310
+ # NOTE: it is ok to have extra entries in dict
311
+ optname_map = self.sanitizer.ARGPARSE_OPTNAME_TO_CONFIG_OPTNAME
312
+ argparse_config = {optname_map.get(optname, optname): value for optname, value in vars(sanitized_argparse_namespace).items()}
313
+
314
+ general_config = sanitized_user_config.get("general", {})
315
+
316
+ dataset_blueprints = []
317
+ for dataset_config in sanitized_user_config.get("datasets", []):
318
+ # NOTE: if subsets have no "metadata_file", these are DreamBooth datasets/subsets
319
+ subsets = dataset_config.get("subsets", [])
320
+ is_dreambooth = all(["metadata_file" not in subset for subset in subsets])
321
+ if is_dreambooth:
322
+ subset_params_klass = DreamBoothSubsetParams
323
+ dataset_params_klass = DreamBoothDatasetParams
324
+ else:
325
+ subset_params_klass = FineTuningSubsetParams
326
+ dataset_params_klass = FineTuningDatasetParams
327
+
328
+ subset_blueprints = []
329
+ for subset_config in subsets:
330
+ params = self.generate_params_by_fallbacks(subset_params_klass,
331
+ [subset_config, dataset_config, general_config, argparse_config, runtime_params])
332
+ subset_blueprints.append(SubsetBlueprint(params))
333
+
334
+ params = self.generate_params_by_fallbacks(dataset_params_klass,
335
+ [dataset_config, general_config, argparse_config, runtime_params])
336
+ dataset_blueprints.append(DatasetBlueprint(is_dreambooth, params, subset_blueprints))
337
+
338
+ dataset_group_blueprint = DatasetGroupBlueprint(dataset_blueprints)
339
+
340
+ return Blueprint(dataset_group_blueprint)
341
+
342
+ @staticmethod
343
+ def generate_params_by_fallbacks(param_klass, fallbacks: Sequence[dict]):
344
+ name_map = BlueprintGenerator.BLUEPRINT_PARAM_NAME_TO_CONFIG_OPTNAME
345
+ search_value = BlueprintGenerator.search_value
346
+ default_params = asdict(param_klass())
347
+ param_names = default_params.keys()
348
+
349
+ params = {name: search_value(name_map.get(name, name), fallbacks, default_params.get(name)) for name in param_names}
350
+
351
+ return param_klass(**params)
352
+
353
+ @staticmethod
354
+ def search_value(key: str, fallbacks: Sequence[dict], default_value = None):
355
+ for cand in fallbacks:
356
+ value = cand.get(key)
357
+ if value is not None:
358
+ return value
359
+
360
+ return default_value
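The fallback behaviour above can be summarised with a small illustrative sketch (not part of the diff; the dict contents are made up): the first dict that carries a non-None value for the key wins, otherwise the supplied default is returned.

fallbacks = [
    {"num_repeats": None},   # e.g. subset_config: option not set
    {"num_repeats": 10},     # e.g. dataset_config: first non-None value wins
    {"num_repeats": 1},      # e.g. general_config: never reached for this key
]
assert BlueprintGenerator.search_value("num_repeats", fallbacks, default_value=1) == 10
assert BlueprintGenerator.search_value("keep_tokens", fallbacks, default_value=0) == 0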
361
+
362
+
363
+ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlueprint):
364
+ datasets: List[Union[DreamBoothDataset, FineTuningDataset]] = []
365
+
366
+ for dataset_blueprint in dataset_group_blueprint.datasets:
367
+ if dataset_blueprint.is_dreambooth:
368
+ subset_klass = DreamBoothSubset
369
+ dataset_klass = DreamBoothDataset
370
+ else:
371
+ subset_klass = FineTuningSubset
372
+ dataset_klass = FineTuningDataset
373
+
374
+ subsets = [subset_klass(**asdict(subset_blueprint.params)) for subset_blueprint in dataset_blueprint.subsets]
375
+ dataset = dataset_klass(subsets=subsets, **asdict(dataset_blueprint.params))
376
+ datasets.append(dataset)
377
+
378
+ # print info
379
+ info = ""
380
+ for i, dataset in enumerate(datasets):
381
+ is_dreambooth = isinstance(dataset, DreamBoothDataset)
382
+ info += dedent(f"""\
383
+ [Dataset {i}]
384
+ batch_size: {dataset.batch_size}
385
+ resolution: {(dataset.width, dataset.height)}
386
+ enable_bucket: {dataset.enable_bucket}
387
+ """)
388
+
389
+ if dataset.enable_bucket:
390
+ info += indent(dedent(f"""\
391
+ min_bucket_reso: {dataset.min_bucket_reso}
392
+ max_bucket_reso: {dataset.max_bucket_reso}
393
+ bucket_reso_steps: {dataset.bucket_reso_steps}
394
+ bucket_no_upscale: {dataset.bucket_no_upscale}
395
+ \n"""), " ")
396
+ else:
397
+ info += "\n"
398
+
399
+ for j, subset in enumerate(dataset.subsets):
400
+ info += indent(dedent(f"""\
401
+ [Subset {j} of Dataset {i}]
402
+ image_dir: "{subset.image_dir}"
403
+ image_count: {subset.img_count}
404
+ num_repeats: {subset.num_repeats}
405
+ shuffle_caption: {subset.shuffle_caption}
406
+ keep_tokens: {subset.keep_tokens}
407
+ caption_dropout_rate: {subset.caption_dropout_rate}
408
+ caption_dropout_every_n_epochs: {subset.caption_dropout_every_n_epochs}
409
+ caption_tag_dropout_rate: {subset.caption_tag_dropout_rate}
410
+ color_aug: {subset.color_aug}
411
+ flip_aug: {subset.flip_aug}
412
+ face_crop_aug_range: {subset.face_crop_aug_range}
413
+ random_crop: {subset.random_crop}
414
+ token_warmup_min: {subset.token_warmup_min}
+ token_warmup_step: {subset.token_warmup_step}
416
+ """), " ")
417
+
418
+ if is_dreambooth:
419
+ info += indent(dedent(f"""\
420
+ is_reg: {subset.is_reg}
421
+ class_tokens: {subset.class_tokens}
422
+ caption_extension: {subset.caption_extension}
423
+ \n"""), " ")
424
+ else:
425
+ info += indent(dedent(f"""\
426
+ metadata_file: {subset.metadata_file}
427
+ \n"""), " ")
428
+
429
+ print(info)
430
+
431
+ # make buckets first because it determines the length of dataset
432
+ # and set the same seed for all datasets
433
+ seed = random.randint(0, 2**31) # actual seed is seed + epoch_no
434
+ for i, dataset in enumerate(datasets):
435
+ print(f"[Dataset {i}]")
436
+ dataset.make_buckets()
437
+ dataset.set_seed(seed)
438
+
439
+ return DatasetGroup(datasets)
440
+
441
+
442
+ def generate_dreambooth_subsets_config_by_subdirs(train_data_dir: Optional[str] = None, reg_data_dir: Optional[str] = None):
443
+ def extract_dreambooth_params(name: str) -> Tuple[int, str]:
444
+ tokens = name.split('_')
445
+ try:
446
+ n_repeats = int(tokens[0])
447
+ except ValueError:
+ print(f"ignore directory without repeats / 繰り返し回数のないディレクトリを無視します: {name}")
449
+ return 0, ""
450
+ caption_by_folder = '_'.join(tokens[1:])
451
+ return n_repeats, caption_by_folder
452
+
453
+ def generate(base_dir: Optional[str], is_reg: bool):
454
+ if base_dir is None:
455
+ return []
456
+
457
+ base_dir: Path = Path(base_dir)
458
+ if not base_dir.is_dir():
459
+ return []
460
+
461
+ subsets_config = []
462
+ for subdir in base_dir.iterdir():
463
+ if not subdir.is_dir():
464
+ continue
465
+
466
+ num_repeats, class_tokens = extract_dreambooth_params(subdir.name)
467
+ if num_repeats < 1:
468
+ continue
469
+
470
+ subset_config = {"image_dir": str(subdir), "num_repeats": num_repeats, "is_reg": is_reg, "class_tokens": class_tokens}
471
+ subsets_config.append(subset_config)
472
+
473
+ return subsets_config
474
+
475
+ subsets_config = []
476
+ subsets_config += generate(train_data_dir, False)
477
+ subsets_config += generate(reg_data_dir, True)
478
+
479
+ return subsets_config
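As an illustration of the directory convention handled above (hypothetical paths, not part of the diff): sub-directories following the `<repeats>_<class tokens>` naming become DreamBooth subset configs, and sub-directories without a leading repeat count are skipped.

# train_data_dir/
#   20_sks dog/    -> num_repeats=20, class_tokens="sks dog"
#   misc/          -> ignored: no leading repeat count
subsets = generate_dreambooth_subsets_config_by_subdirs(train_data_dir="train_data_dir", reg_data_dir=None)
# subsets == [{"image_dir": "train_data_dir/20_sks dog", "num_repeats": 20,
#              "is_reg": False, "class_tokens": "sks dog"}]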
480
+
481
+
482
+ def load_user_config(file: str) -> dict:
483
+ file: Path = Path(file)
484
+ if not file.is_file():
485
+ raise ValueError(f"file not found / ファイルが見つかりません: {file}")
486
+
487
+ if file.name.lower().endswith('.json'):
488
+ try:
489
+ config = json.loads(file.read_text(encoding='utf-8'))
490
+ except Exception:
491
+ print(f"Error on parsing JSON config file. Please check the format. / JSON 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}")
492
+ raise
493
+ elif file.name.lower().endswith('.toml'):
494
+ try:
495
+ config = toml.load(file)
496
+ except Exception:
497
+ print(f"Error on parsing TOML config file. Please check the format. / TOML 形式の設定ファイルの読み込みに失敗しました。文法が正しいか確認してください。: {file}")
498
+ raise
499
+ else:
500
+ raise ValueError(f"not supported config file format / 対応していない設定ファイルの形式です: {file}")
501
+
502
+ return config
503
+
504
+ # for config test
505
+ if __name__ == "__main__":
506
+ parser = argparse.ArgumentParser()
507
+ parser.add_argument("--support_dreambooth", action="store_true")
508
+ parser.add_argument("--support_finetuning", action="store_true")
509
+ parser.add_argument("--support_dropout", action="store_true")
510
+ parser.add_argument("dataset_config")
511
+ config_args, remain = parser.parse_known_args()
512
+
513
+ parser = argparse.ArgumentParser()
514
+ train_util.add_dataset_arguments(parser, config_args.support_dreambooth, config_args.support_finetuning, config_args.support_dropout)
515
+ train_util.add_training_arguments(parser, config_args.support_dreambooth)
516
+ argparse_namespace = parser.parse_args(remain)
517
+ train_util.prepare_dataset_args(argparse_namespace, config_args.support_finetuning)
518
+
519
+ print("[argparse_namespace]")
520
+ print(vars(argparse_namespace))
521
+
522
+ user_config = load_user_config(config_args.dataset_config)
523
+
524
+ print("\n[user_config]")
525
+ print(user_config)
526
+
527
+ sanitizer = ConfigSanitizer(config_args.support_dreambooth, config_args.support_finetuning, config_args.support_dropout)
528
+ sanitized_user_config = sanitizer.sanitize_user_config(user_config)
529
+
530
+ print("\n[sanitized_user_config]")
531
+ print(sanitized_user_config)
532
+
533
+ blueprint = BlueprintGenerator(sanitizer).generate(user_config, argparse_namespace)
534
+
535
+ print("\n[blueprint]")
536
+ print(blueprint)
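For reference, a minimal user config of the kind this test driver loads might look like the sketch below. It is only a sketch: the option names used here are the ones appearing in this file, the authoritative set of accepted options is defined by the schemas earlier in the file (and documented in config_README-ja.md), and the path is a placeholder.

import toml

user_config = toml.loads('''
[general]

[[datasets]]

[[datasets.subsets]]
image_dir = "path/to/img/10_sks dog"
num_repeats = 10
class_tokens = "sks dog"
''')

# With DreamBooth support enabled this should pass the sanitizer:
sanitized = ConfigSanitizer(True, False, False).sanitize_user_config(user_config)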
library/convert_model_gui.py ADDED
@@ -0,0 +1,247 @@
1
+ import gradio as gr
2
+ from easygui import msgbox
3
+ import subprocess
4
+ import os
5
+ import shutil
6
+ from .common_gui import get_folder_path, get_file_path
7
+
8
+ folder_symbol = '\U0001f4c2' # 📂
9
+ refresh_symbol = '\U0001f504' # 🔄
10
+ save_style_symbol = '\U0001f4be' # 💾
11
+ document_symbol = '\U0001F4C4' # 📄
12
+ PYTHON = 'python3' if os.name == 'posix' else './venv/Scripts/python.exe'
13
+
14
+
15
+ def convert_model(
16
+ source_model_input,
17
+ source_model_type,
18
+ target_model_folder_input,
19
+ target_model_name_input,
20
+ target_model_type,
21
+ target_save_precision_type,
22
+ ):
23
+ # Check that a source model type was selected
24
+ if source_model_type == '':
25
+ msgbox('Invalid source model type')
26
+ return
27
+
28
+ # Check if the source model exists
29
+ if os.path.isfile(source_model_input):
30
+ print('The provided source model is a file')
31
+ elif os.path.isdir(source_model_input):
32
+ print('The provided model is a folder')
33
+ else:
34
+ msgbox('The provided source model is neither a file nor a folder')
35
+ return
36
+
37
+ # Check if the target model folder exists
+ if os.path.isdir(target_model_folder_input):
+ print('The provided model folder exists')
40
+ else:
41
+ msgbox('The provided target folder does not exist')
42
+ return
43
+
44
+ run_cmd = f'{PYTHON} "tools/convert_diffusers20_original_sd.py"'
45
+
46
+ v1_models = [
47
+ 'runwayml/stable-diffusion-v1-5',
48
+ 'CompVis/stable-diffusion-v1-4',
49
+ ]
50
+
51
+ # check if v1 models
52
+ if str(source_model_type) in v1_models:
53
+ print('SD v1 model specified. Setting --v1 parameter')
54
+ run_cmd += ' --v1'
55
+ else:
56
+ print('SD v2 model specified. Setting --v2 parameter')
57
+ run_cmd += ' --v2'
58
+
59
+ if not target_save_precision_type == 'unspecified':
60
+ run_cmd += f' --{target_save_precision_type}'
61
+
62
+ if (
63
+ target_model_type == 'diffuser'
64
+ or target_model_type == 'diffuser_safetensors'
65
+ ):
66
+ run_cmd += f' --reference_model="{source_model_type}"'
67
+
68
+ if target_model_type == 'diffuser_safetensors':
69
+ run_cmd += ' --use_safetensors'
70
+
71
+ run_cmd += f' "{source_model_input}"'
72
+
73
+ if (
74
+ target_model_type == 'diffuser'
75
+ or target_model_type == 'diffuser_safetensors'
76
+ ):
77
+ target_model_path = os.path.join(
78
+ target_model_folder_input, target_model_name_input
79
+ )
80
+ run_cmd += f' "{target_model_path}"'
81
+ else:
82
+ target_model_path = os.path.join(
83
+ target_model_folder_input,
84
+ f'{target_model_name_input}.{target_model_type}',
85
+ )
86
+ run_cmd += f' "{target_model_path}"'
87
+
88
+ print(run_cmd)
89
+
90
+ # Run the command
91
+ if os.name == 'posix':
92
+ os.system(run_cmd)
93
+ else:
94
+ subprocess.run(run_cmd)
95
+
96
+ # Only checkpoint-style outputs need an inference yaml copied next to them
+ if target_model_type not in ['diffuser', 'diffuser_safetensors']:
100
+
101
+ v2_models = [
102
+ 'stabilityai/stable-diffusion-2-1-base',
103
+ 'stabilityai/stable-diffusion-2-base',
104
+ ]
105
+ v_parameterization = [
106
+ 'stabilityai/stable-diffusion-2-1',
107
+ 'stabilityai/stable-diffusion-2',
108
+ ]
109
+
110
+ if str(source_model_type) in v2_models:
111
+ inference_file = os.path.join(
112
+ target_model_folder_input, f'{target_model_name_input}.yaml'
113
+ )
114
+ print(f'Saving v2-inference.yaml as {inference_file}')
115
+ shutil.copy(
116
+ f'./v2_inference/v2-inference.yaml',
117
+ f'{inference_file}',
118
+ )
119
+
120
+ if str(source_model_type) in v_parameterization:
121
+ inference_file = os.path.join(
122
+ target_model_folder_input, f'{target_model_name_input}.yaml'
123
+ )
124
+ print(f'Saving v2-inference-v.yaml as {inference_file}')
125
+ shutil.copy(
126
+ f'./v2_inference/v2-inference-v.yaml',
127
+ f'{inference_file}',
128
+ )
129
+
130
+
131
+ # parser = argparse.ArgumentParser()
132
+ # parser.add_argument("--v1", action='store_true',
133
+ # help='load v1.x model (v1 or v2 is required to load checkpoint) / 1.xのモデルを読み込む')
134
+ # parser.add_argument("--v2", action='store_true',
135
+ # help='load v2.0 model (v1 or v2 is required to load checkpoint) / 2.0のモデルを読み込む')
136
+ # parser.add_argument("--fp16", action='store_true',
137
+ # help='load as fp16 (Diffusers only) and save as fp16 (checkpoint only) / fp16形式で読み込み(Diffusers形式のみ対応)、保存する(checkpointのみ対応)')
138
+ # parser.add_argument("--bf16", action='store_true', help='save as bf16 (checkpoint only) / bf16形式で保存する(checkpointのみ対応)')
139
+ # parser.add_argument("--float", action='store_true',
140
+ # help='save as float (checkpoint only) / float(float32)形式で保存する(checkpointのみ対応)')
141
+ # parser.add_argument("--epoch", type=int, default=0, help='epoch to write to checkpoint / checkpointに記録するepoch数の値')
142
+ # parser.add_argument("--global_step", type=int, default=0,
143
+ # help='global_step to write to checkpoint / checkpointに記録するglobal_stepの値')
144
+ # parser.add_argument("--reference_model", type=str, default=None,
145
+ # help="reference model for scheduler/tokenizer, required in saving Diffusers, copy scheduler/tokenizer from this / scheduler/tokenizerのコピー元のDiffusersモデル、Diffusers形式で保存するときに必要")
146
+
147
+ # parser.add_argument("model_to_load", type=str, default=None,
148
+ # help="model to load: checkpoint file or Diffusers model's directory / 読み込むモデル、checkpointかDiffusers形式モデルのディレクトリ")
149
+ # parser.add_argument("model_to_save", type=str, default=None,
150
+ # help="model to save: checkpoint (with extension) or Diffusers model's directory (without extension) / 変換後のモデル、拡張子がある場合はcheckpoint、ない場合はDiffusersモデルとして保存")
151
+
152
+
153
+ ###
154
+ # Gradio UI
155
+ ###
156
+
157
+
158
+ def gradio_convert_model_tab():
159
+ with gr.Tab('Convert model'):
160
+ gr.Markdown(
161
+ 'This utility can be used to convert from one stable diffusion model format to another.'
162
+ )
163
+ with gr.Row():
164
+ source_model_input = gr.Textbox(
165
+ label='Source model',
166
+ placeholder='path to source model folder or file to convert...',
167
+ interactive=True,
168
+ )
169
+ button_source_model_dir = gr.Button(
170
+ folder_symbol, elem_id='open_folder_small'
171
+ )
172
+ button_source_model_dir.click(
173
+ get_folder_path,
174
+ outputs=source_model_input,
175
+ show_progress=False,
176
+ )
177
+
178
+ button_source_model_file = gr.Button(
179
+ document_symbol, elem_id='open_folder_small'
180
+ )
181
+ button_source_model_file.click(
182
+ get_file_path,
183
+ inputs=[source_model_input],
184
+ outputs=source_model_input,
185
+ show_progress=False,
186
+ )
187
+
188
+ source_model_type = gr.Dropdown(
189
+ label='Source model type',
190
+ choices=[
191
+ 'stabilityai/stable-diffusion-2-1-base',
192
+ 'stabilityai/stable-diffusion-2-base',
193
+ 'stabilityai/stable-diffusion-2-1',
194
+ 'stabilityai/stable-diffusion-2',
195
+ 'runwayml/stable-diffusion-v1-5',
196
+ 'CompVis/stable-diffusion-v1-4',
197
+ ],
198
+ )
199
+ with gr.Row():
200
+ target_model_folder_input = gr.Textbox(
201
+ label='Target model folder',
202
+ placeholder='path to target model folder or file name to create...',
203
+ interactive=True,
204
+ )
205
+ button_target_model_folder = gr.Button(
206
+ folder_symbol, elem_id='open_folder_small'
207
+ )
208
+ button_target_model_folder.click(
209
+ get_folder_path,
210
+ outputs=target_model_folder_input,
211
+ show_progress=False,
212
+ )
213
+
214
+ target_model_name_input = gr.Textbox(
215
+ label='Target model name',
216
+ placeholder='target model name...',
217
+ interactive=True,
218
+ )
219
+ target_model_type = gr.Dropdown(
220
+ label='Target model type',
221
+ choices=[
222
+ 'diffuser',
223
+ 'diffuser_safetensors',
224
+ 'ckpt',
225
+ 'safetensors',
226
+ ],
227
+ )
228
+ target_save_precision_type = gr.Dropdown(
229
+ label='Target model precision',
230
+ choices=['unspecified', 'fp16', 'bf16', 'float'],
231
+ value='unspecified',
232
+ )
233
+
234
+ convert_button = gr.Button('Convert model')
235
+
236
+ convert_button.click(
237
+ convert_model,
238
+ inputs=[
239
+ source_model_input,
240
+ source_model_type,
241
+ target_model_folder_input,
242
+ target_model_name_input,
243
+ target_model_type,
244
+ target_save_precision_type,
245
+ ],
246
+ show_progress=False,
247
+ )
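To make the command assembly in convert_model above concrete, converting a hypothetical SD v1.5 checkpoint to an fp16 safetensors file builds roughly the following command (paths and names are placeholders; shown for the Linux code path where PYTHON is python3):

# source type 'runwayml/stable-diffusion-v1-5', target type 'safetensors',
# precision 'fp16', target folder 'models', target name 'converted'
expected_cmd = (
    'python3 "tools/convert_diffusers20_original_sd.py" --v1 --fp16'
    ' "/path/to/source.ckpt" "models/converted.safetensors"'
)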
library/custom_train_functions.py ADDED
@@ -0,0 +1,18 @@
1
+ import torch
2
+ import argparse
3
+
4
+ def apply_snr_weight(loss, timesteps, noise_scheduler, gamma):
5
+ alphas_cumprod = noise_scheduler.alphas_cumprod
6
+ sqrt_alphas_cumprod = torch.sqrt(alphas_cumprod)
7
+ sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - alphas_cumprod)
8
+ alpha = sqrt_alphas_cumprod
9
+ sigma = sqrt_one_minus_alphas_cumprod
10
+ all_snr = (alpha / sigma) ** 2
11
+ snr = torch.stack([all_snr[t] for t in timesteps])
12
+ gamma_over_snr = torch.div(torch.ones_like(snr)*gamma,snr)
13
+ snr_weight = torch.minimum(gamma_over_snr,torch.ones_like(gamma_over_snr)).float() #from paper
14
+ loss = loss * snr_weight
15
+ return loss
16
+
17
+ def add_custom_train_arguments(parser: argparse.ArgumentParser):
18
+ parser.add_argument("--min_snr_gamma", type=float, default=None, help="gamma for reducing the weight of high loss timesteps. Lower numbers have stronger effect. 5 is recommended by paper. / 低いタイムステップでの高いlossに対して重みを減らすためのgamma値、低いほど効果が強く、論文では5が推奨")
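A minimal usage sketch for apply_snr_weight (not part of the diff): it assumes a diffusers DDPMScheduler, a per-sample loss of shape (batch,), and that the script runs from the repository root so the library package is importable.

import torch
from diffusers import DDPMScheduler
from library.custom_train_functions import apply_snr_weight

noise_scheduler = DDPMScheduler(num_train_timesteps=1000)
timesteps = torch.randint(0, 1000, (4,))   # one sampled timestep per batch element
per_sample_loss = torch.rand(4)            # e.g. F.mse_loss(..., reduction='none').mean(dim=(1, 2, 3))

# min(gamma / SNR, 1) down-weights low-noise (high-SNR) timesteps
weighted = apply_snr_weight(per_sample_loss, timesteps, noise_scheduler, gamma=5.0)
loss = weighted.mean()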
library/dataset_balancing_gui.py ADDED
@@ -0,0 +1,146 @@
1
+ import os
2
+ import re
3
+ import gradio as gr
4
+ from easygui import msgbox, boolbox
5
+ from .common_gui import get_folder_path
6
+
7
+ # def select_folder():
8
+ # # Open a file dialog to select a directory
9
+ # folder = filedialog.askdirectory()
10
+
11
+ # # Update the GUI to display the selected folder
12
+ # selected_folder_label.config(text=folder)
13
+
14
+
15
+ def dataset_balancing(concept_repeats, folder, insecure):
16
+
17
+ if not concept_repeats > 0:
18
+ # Display an error message if the total number of repeats is not a positive number
19
+ msgbox('Please enter a valid integer for the total number of repeats.')
20
+ return
21
+
22
+ concept_repeats = int(concept_repeats)
23
+
24
+ # Check if folder exist
25
+ if folder == '' or not os.path.isdir(folder):
26
+ msgbox('Please enter a valid folder for balancing.')
27
+ return
28
+
29
+ pattern = re.compile(r'^\d+_.+$')
30
+
31
+ # Iterate over the subdirectories in the selected folder
32
+ for subdir in os.listdir(folder):
33
+ if pattern.match(subdir) or insecure:
34
+ # Calculate the number of repeats for the current subdirectory
35
+ # Get a list of all the files in the folder
36
+ files = os.listdir(os.path.join(folder, subdir))
37
+
38
+ # Filter the list to include only image files
39
+ image_files = [
40
+ f
41
+ for f in files
42
+ if f.endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp'))
43
+ ]
44
+
45
+ # Count the number of image files
46
+ images = len(image_files)
47
+
48
+ # Check if the subdirectory name starts with a number inside braces,
49
+ # indicating that the repeats value should be multiplied
50
+ match = re.match(r'^\{(\d+\.?\d*)\}', subdir)
51
+ if match:
52
+ # Multiply the repeats value by the number inside the braces
53
+ if not images == 0:
54
+ repeats = max(
55
+ 1,
56
+ round(
57
+ concept_repeats / images * float(match.group(1))
58
+ ),
59
+ )
60
+ else:
61
+ repeats = 0
62
+ subdir = subdir[match.end() :]
63
+ else:
64
+ if not images == 0:
65
+ repeats = max(1, round(concept_repeats / images))
66
+ else:
67
+ repeats = 0
68
+
69
+ # Check if the subdirectory name already has a number at the beginning
70
+ match = re.match(r'^\d+_', subdir)
71
+ if match:
72
+ # Replace the existing number with the new number
73
+ old_name = os.path.join(folder, subdir)
74
+ new_name = os.path.join(
75
+ folder, f'{repeats}_{subdir[match.end():]}'
76
+ )
77
+ else:
78
+ # Add the new number at the beginning of the name
79
+ old_name = os.path.join(folder, subdir)
80
+ new_name = os.path.join(folder, f'{repeats}_{subdir}')
81
+
82
+ os.rename(old_name, new_name)
83
+ else:
84
+ print(
85
+ f'Skipping folder {subdir} because it does not match kohya_ss expected syntax...'
86
+ )
87
+
88
+ msgbox('Dataset balancing completed...')
89
+
90
+
91
+ def warning(insecure):
92
+ if insecure:
93
+ if boolbox(
94
+ f'WARNING!!! You have asked to rename non kohya_ss <num>_<text> folders...\n\nAre you sure you want to do that?',
95
+ choices=('Yes, I like danger', 'No, get me out of here'),
96
+ ):
97
+ return True
98
+ else:
99
+ return False
100
+
101
+
102
+ def gradio_dataset_balancing_tab():
103
+ with gr.Tab('Dreambooth/LoRA Dataset balancing'):
104
+ gr.Markdown(
105
+ 'This utility will ensure that each concept folder in the dataset folder is used equally during the training process of the dreambooth machine learning model, regardless of the number of images in each folder. It will do this by renaming the concept folders to indicate the number of times they should be repeated during training.'
106
+ )
107
+ gr.Markdown(
108
+ 'WARNING! The use of this utility on the wrong folder can lead to unexpected folder renaming!!!'
109
+ )
110
+ with gr.Row():
111
+ select_dataset_folder_input = gr.Textbox(
112
+ label='Dataset folder',
113
+ placeholder='Folder containing the concepts folders to balance...',
114
+ interactive=True,
115
+ )
116
+
117
+ select_dataset_folder_button = gr.Button(
118
+ '📂', elem_id='open_folder_small'
119
+ )
120
+ select_dataset_folder_button.click(
121
+ get_folder_path,
122
+ outputs=select_dataset_folder_input,
123
+ show_progress=False,
124
+ )
125
+
126
+ total_repeats_number = gr.Number(
127
+ value=1000,
128
+ interactive=True,
129
+ label='Training steps per concept per epoch',
130
+ )
131
+ with gr.Accordion('Advanced options', open=False):
132
+ insecure = gr.Checkbox(
133
+ value=False,
134
+ label='DANGER!!! -- Insecure folder renaming -- DANGER!!!',
135
+ )
136
+ insecure.change(warning, inputs=insecure, outputs=insecure)
137
+ balance_button = gr.Button('Balance dataset')
138
+ balance_button.click(
139
+ dataset_balancing,
140
+ inputs=[
141
+ total_repeats_number,
142
+ select_dataset_folder_input,
143
+ insecure,
144
+ ],
145
+ show_progress=False,
146
+ )
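A worked example of the renaming rule above (illustrative folder name): with 'Training steps per concept per epoch' set to 1000, a concept folder named 5_myconcept that contains 25 images is renamed to 40_myconcept, because

concept_repeats, images = 1000, 25
repeats = max(1, round(concept_repeats / images))   # -> 40, so '5_myconcept' becomes '40_myconcept'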
library/dreambooth_folder_creation_gui.py ADDED
@@ -0,0 +1,210 @@
1
+ import gradio as gr
2
+ from easygui import diropenbox, msgbox
3
+ from .common_gui import get_folder_path
4
+ import shutil
5
+ import os
6
+
7
+
8
+ def copy_info_to_Folders_tab(training_folder):
9
+ img_folder = os.path.join(training_folder, 'img')
10
+ if os.path.exists(os.path.join(training_folder, 'reg')):
11
+ reg_folder = os.path.join(training_folder, 'reg')
12
+ else:
13
+ reg_folder = ''
14
+ model_folder = os.path.join(training_folder, 'model')
15
+ log_folder = os.path.join(training_folder, 'log')
16
+
17
+ return img_folder, reg_folder, model_folder, log_folder
18
+
19
+
20
+ def dreambooth_folder_preparation(
21
+ util_training_images_dir_input,
22
+ util_training_images_repeat_input,
23
+ util_instance_prompt_input,
24
+ util_regularization_images_dir_input,
25
+ util_regularization_images_repeat_input,
26
+ util_class_prompt_input,
27
+ util_training_dir_output,
28
+ ):
29
+
30
+ # Check if the input variables are empty
31
+ if not len(util_training_dir_output):
32
+ print(
33
+ "Destination training directory is missing... can't perform the required task..."
34
+ )
35
+ return
36
+ else:
37
+ # Create the util_training_dir_output directory if it doesn't exist
38
+ os.makedirs(util_training_dir_output, exist_ok=True)
39
+
40
+ # Check for instance prompt
41
+ if util_instance_prompt_input == '':
42
+ msgbox('Instance prompt missing...')
43
+ return
44
+
45
+ # Check for class prompt
46
+ if util_class_prompt_input == '':
47
+ msgbox('Class prompt missing...')
48
+ return
49
+
50
+ # Create the training_dir path
51
+ if util_training_images_dir_input == '':
52
+ print(
53
+ "Training images directory is missing... can't perform the required task..."
54
+ )
55
+ return
56
+ else:
57
+ training_dir = os.path.join(
58
+ util_training_dir_output,
59
+ f'img/{int(util_training_images_repeat_input)}_{util_instance_prompt_input} {util_class_prompt_input}',
60
+ )
61
+
62
+ # Remove folders if they exist
63
+ if os.path.exists(training_dir):
64
+ print(f'Removing existing directory {training_dir}...')
65
+ shutil.rmtree(training_dir)
66
+
67
+ # Copy the training images to their respective directories
68
+ print(f'Copy {util_training_images_dir_input} to {training_dir}...')
69
+ shutil.copytree(util_training_images_dir_input, training_dir)
70
+
71
+ if not util_regularization_images_dir_input == '':
72
+ # Create the regularization_dir path
73
+ if not util_regularization_images_repeat_input > 0:
74
+ print('Repeats is missing... not copying regularisation images...')
75
+ else:
76
+ regularization_dir = os.path.join(
77
+ util_training_dir_output,
78
+ f'reg/{int(util_regularization_images_repeat_input)}_{util_class_prompt_input}',
79
+ )
80
+
81
+ # Remove folders if they exist
82
+ if os.path.exists(regularization_dir):
83
+ print(f'Removing existing directory {regularization_dir}...')
84
+ shutil.rmtree(regularization_dir)
85
+
86
+ # Copy the regularisation images to their respective directories
87
+ print(
88
+ f'Copy {util_regularization_images_dir_input} to {regularization_dir}...'
89
+ )
90
+ shutil.copytree(
91
+ util_regularization_images_dir_input, regularization_dir
92
+ )
93
+ else:
94
+ print(
95
+ 'Regularization images directory is missing... not copying regularisation images...'
96
+ )
97
+
98
+ # create log and model folder
99
+ # Check if the log folder exists and create it if it doesn't
100
+ if not os.path.exists(os.path.join(util_training_dir_output, 'log')):
101
+ os.makedirs(os.path.join(util_training_dir_output, 'log'))
102
+
103
+ # Check if the model folder exists and create it if it doesn't
104
+ if not os.path.exists(os.path.join(util_training_dir_output, 'model')):
105
+ os.makedirs(os.path.join(util_training_dir_output, 'model'))
106
+
107
+ print(
108
+ f'Done creating kohya_ss training folder structure at {util_training_dir_output}...'
109
+ )
110
+
111
+
112
+ def gradio_dreambooth_folder_creation_tab(
113
+ train_data_dir_input=gr.Textbox(),
114
+ reg_data_dir_input=gr.Textbox(),
115
+ output_dir_input=gr.Textbox(),
116
+ logging_dir_input=gr.Textbox(),
117
+ ):
118
+ with gr.Tab('Dreambooth/LoRA Folder preparation'):
119
+ gr.Markdown(
120
+ 'This utility will create the necessary folder structure for the training images and optional regularization images needed for the kohya_ss Dreambooth/LoRA method to function correctly.'
121
+ )
122
+ with gr.Row():
123
+ util_instance_prompt_input = gr.Textbox(
124
+ label='Instance prompt',
125
+ placeholder='Eg: asd',
126
+ interactive=True,
127
+ )
128
+ util_class_prompt_input = gr.Textbox(
129
+ label='Class prompt',
130
+ placeholder='Eg: person',
131
+ interactive=True,
132
+ )
133
+ with gr.Row():
134
+ util_training_images_dir_input = gr.Textbox(
135
+ label='Training images',
136
+ placeholder='Directory containing the training images',
137
+ interactive=True,
138
+ )
139
+ button_util_training_images_dir_input = gr.Button(
140
+ '📂', elem_id='open_folder_small'
141
+ )
142
+ button_util_training_images_dir_input.click(
143
+ get_folder_path,
144
+ outputs=util_training_images_dir_input,
145
+ show_progress=False,
146
+ )
147
+ util_training_images_repeat_input = gr.Number(
148
+ label='Repeats',
149
+ value=40,
150
+ interactive=True,
151
+ elem_id='number_input',
152
+ )
153
+ with gr.Row():
154
+ util_regularization_images_dir_input = gr.Textbox(
155
+ label='Regularisation images',
156
+ placeholder='(Optional) Directory containing the regularisation images',
157
+ interactive=True,
158
+ )
159
+ button_util_regularization_images_dir_input = gr.Button(
160
+ '📂', elem_id='open_folder_small'
161
+ )
162
+ button_util_regularization_images_dir_input.click(
163
+ get_folder_path,
164
+ outputs=util_regularization_images_dir_input,
165
+ show_progress=False,
166
+ )
167
+ util_regularization_images_repeat_input = gr.Number(
168
+ label='Repeats',
169
+ value=1,
170
+ interactive=True,
171
+ elem_id='number_input',
172
+ )
173
+ with gr.Row():
174
+ util_training_dir_output = gr.Textbox(
175
+ label='Destination training directory',
176
+ placeholder='Directory where formatted training and regularisation folders will be placed',
177
+ interactive=True,
178
+ )
179
+ button_util_training_dir_output = gr.Button(
180
+ '📂', elem_id='open_folder_small'
181
+ )
182
+ button_util_training_dir_output.click(
183
+ get_folder_path, outputs=util_training_dir_output
184
+ )
185
+ button_prepare_training_data = gr.Button('Prepare training data')
186
+ button_prepare_training_data.click(
187
+ dreambooth_folder_preparation,
188
+ inputs=[
189
+ util_training_images_dir_input,
190
+ util_training_images_repeat_input,
191
+ util_instance_prompt_input,
192
+ util_regularization_images_dir_input,
193
+ util_regularization_images_repeat_input,
194
+ util_class_prompt_input,
195
+ util_training_dir_output,
196
+ ],
197
+ show_progress=False,
198
+ )
199
+ button_copy_info_to_Folders_tab = gr.Button('Copy info to Folders Tab')
200
+ button_copy_info_to_Folders_tab.click(
201
+ copy_info_to_Folders_tab,
202
+ inputs=[util_training_dir_output],
203
+ outputs=[
204
+ train_data_dir_input,
205
+ reg_data_dir_input,
206
+ output_dir_input,
207
+ logging_dir_input,
208
+ ],
209
+ show_progress=False,
210
+ )
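Illustrative only: for instance prompt 'asd', class prompt 'person', 40 training repeats, 1 regularisation repeat and destination folder 'project', dreambooth_folder_preparation above produces a layout along these lines (paths are hypothetical):

expected_layout = [
    'project/img/40_asd person/',   # copies of the training images
    'project/reg/1_person/',        # copies of the regularisation images, if provided
    'project/log/',
    'project/model/',
]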
library/extract_lora_gui.py ADDED
@@ -0,0 +1,178 @@
1
+ import gradio as gr
2
+ from easygui import msgbox
3
+ import subprocess
4
+ import os
5
+ from .common_gui import (
6
+ get_saveasfilename_path,
7
+ get_any_file_path,
8
+ get_file_path,
9
+ )
10
+
11
+ folder_symbol = '\U0001f4c2' # 📂
12
+ refresh_symbol = '\U0001f504' # 🔄
13
+ save_style_symbol = '\U0001f4be' # 💾
14
+ document_symbol = '\U0001F4C4' # 📄
15
+ PYTHON = 'python3' if os.name == 'posix' else './venv/Scripts/python.exe'
16
+
17
+
18
+ def extract_lora(
19
+ model_tuned,
20
+ model_org,
21
+ save_to,
22
+ save_precision,
23
+ dim,
24
+ v2,
25
+ conv_dim,
26
+ device,
27
+ ):
28
+ # Check for caption_text_input
29
+ if model_tuned == '':
30
+ msgbox('Invalid finetuned model file')
31
+ return
32
+
33
+ if model_org == '':
34
+ msgbox('Invalid base model file')
35
+ return
36
+
37
+ # Check if source model exist
38
+ if not os.path.isfile(model_tuned):
39
+ msgbox('The provided finetuned model is not a file')
40
+ return
41
+
42
+ if not os.path.isfile(model_org):
43
+ msgbox('The provided base model is not a file')
44
+ return
45
+
46
+ run_cmd = (
47
+ f'{PYTHON} "{os.path.join("networks","extract_lora_from_models.py")}"'
48
+ )
49
+ run_cmd += f' --save_precision {save_precision}'
50
+ run_cmd += f' --save_to "{save_to}"'
51
+ run_cmd += f' --model_org "{model_org}"'
52
+ run_cmd += f' --model_tuned "{model_tuned}"'
53
+ run_cmd += f' --dim {dim}'
54
+ run_cmd += f' --device {device}'
55
+ if conv_dim > 0:
56
+ run_cmd += f' --conv_dim {conv_dim}'
57
+ if v2:
58
+ run_cmd += f' --v2'
59
+
60
+ print(run_cmd)
61
+
62
+ # Run the command
63
+ if os.name == 'posix':
64
+ os.system(run_cmd)
65
+ else:
66
+ subprocess.run(run_cmd)
67
+
68
+
69
+ ###
70
+ # Gradio UI
71
+ ###
72
+
73
+
74
+ def gradio_extract_lora_tab():
75
+ with gr.Tab('Extract LoRA'):
76
+ gr.Markdown(
77
+ 'This utility can extract a LoRA network from a finetuned model.'
78
+ )
79
+ lora_ext = gr.Textbox(value='*.safetensors *.pt', visible=False)
80
+ lora_ext_name = gr.Textbox(value='LoRA model types', visible=False)
81
+ model_ext = gr.Textbox(value='*.ckpt *.safetensors', visible=False)
82
+ model_ext_name = gr.Textbox(value='Model types', visible=False)
83
+
84
+ with gr.Row():
85
+ model_tuned = gr.Textbox(
86
+ label='Finetuned model',
87
+ placeholder='Path to the finetuned model to extract',
88
+ interactive=True,
89
+ )
90
+ button_model_tuned_file = gr.Button(
91
+ folder_symbol, elem_id='open_folder_small'
92
+ )
93
+ button_model_tuned_file.click(
94
+ get_file_path,
95
+ inputs=[model_tuned, model_ext, model_ext_name],
96
+ outputs=model_tuned,
97
+ show_progress=False,
98
+ )
99
+
100
+ model_org = gr.Textbox(
101
+ label='Stable Diffusion base model',
102
+ placeholder='Stable Diffusion original model: ckpt or safetensors file',
103
+ interactive=True,
104
+ )
105
+ button_model_org_file = gr.Button(
106
+ folder_symbol, elem_id='open_folder_small'
107
+ )
108
+ button_model_org_file.click(
109
+ get_file_path,
110
+ inputs=[model_org, model_ext, model_ext_name],
111
+ outputs=model_org,
112
+ show_progress=False,
113
+ )
114
+ with gr.Row():
115
+ save_to = gr.Textbox(
116
+ label='Save to',
117
+ placeholder='path where to save the extracted LoRA model...',
118
+ interactive=True,
119
+ )
120
+ button_save_to = gr.Button(
121
+ folder_symbol, elem_id='open_folder_small'
122
+ )
123
+ button_save_to.click(
124
+ get_saveasfilename_path,
125
+ inputs=[save_to, lora_ext, lora_ext_name],
126
+ outputs=save_to,
127
+ show_progress=False,
128
+ )
129
+ save_precision = gr.Dropdown(
130
+ label='Save precision',
131
+ choices=['fp16', 'bf16', 'float'],
132
+ value='float',
133
+ interactive=True,
134
+ )
135
+ with gr.Row():
136
+ dim = gr.Slider(
137
+ minimum=4,
138
+ maximum=1024,
139
+ label='Network Dimension (Rank)',
140
+ value=128,
141
+ step=1,
142
+ interactive=True,
143
+ )
144
+ conv_dim = gr.Slider(
145
+ minimum=0,
146
+ maximum=1024,
147
+ label='Conv Dimension (Rank)',
148
+ value=128,
149
+ step=1,
150
+ interactive=True,
151
+ )
152
+ v2 = gr.Checkbox(label='v2', value=False, interactive=True)
153
+ device = gr.Dropdown(
154
+ label='Device',
155
+ choices=[
156
+ 'cpu',
157
+ 'cuda',
158
+ ],
159
+ value='cuda',
160
+ interactive=True,
161
+ )
162
+
163
+ extract_button = gr.Button('Extract LoRA model')
164
+
165
+ extract_button.click(
166
+ extract_lora,
167
+ inputs=[
168
+ model_tuned,
169
+ model_org,
170
+ save_to,
171
+ save_precision,
172
+ dim,
173
+ v2,
174
+ conv_dim,
175
+ device
176
+ ],
177
+ show_progress=False,
178
+ )
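As with the converter above, an illustrative command line that extract_lora builds for the default rank of 128 on CUDA, saving as fp16 (file names are placeholders; Linux code path):

expected_cmd = (
    'python3 "networks/extract_lora_from_models.py" --save_precision fp16'
    ' --save_to "my_lora.safetensors" --model_org "base.safetensors"'
    ' --model_tuned "tuned.safetensors" --dim 128 --device cuda --conv_dim 128'
)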
library/extract_lycoris_locon_gui.py ADDED
@@ -0,0 +1,309 @@
1
+ import gradio as gr
2
+ from easygui import msgbox
3
+ import subprocess
4
+ import os
5
+ from .common_gui import (
6
+ get_saveasfilename_path,
7
+ get_any_file_path,
8
+ get_file_path,
9
+ )
10
+
11
+ folder_symbol = '\U0001f4c2' # 📂
12
+ refresh_symbol = '\U0001f504' # 🔄
13
+ save_style_symbol = '\U0001f4be' # 💾
14
+ document_symbol = '\U0001F4C4' # 📄
15
+ PYTHON = 'python3' if os.name == 'posix' else './venv/Scripts/python.exe'
16
+
17
+
18
+ def extract_lycoris_locon(
19
+ db_model,
20
+ base_model,
21
+ output_name,
22
+ device,
23
+ is_v2,
24
+ mode,
25
+ linear_dim,
26
+ conv_dim,
27
+ linear_threshold,
28
+ conv_threshold,
29
+ linear_ratio,
30
+ conv_ratio,
31
+ linear_quantile,
32
+ conv_quantile,
33
+ use_sparse_bias,
34
+ sparsity,
35
+ disable_cp,
36
+ ):
37
+ # Check for caption_text_input
38
+ if db_model == '':
39
+ msgbox('Invalid finetuned model file')
40
+ return
41
+
42
+ if base_model == '':
43
+ msgbox('Invalid base model file')
44
+ return
45
+
46
+ # Check if source model exist
47
+ if not os.path.isfile(db_model):
48
+ msgbox('The provided finetuned model is not a file')
49
+ return
50
+
51
+ if not os.path.isfile(base_model):
52
+ msgbox('The provided base model is not a file')
53
+ return
54
+
55
+ run_cmd = f'{PYTHON} "{os.path.join("tools","lycoris_locon_extract.py")}"'
56
+ if is_v2:
57
+ run_cmd += f' --is_v2'
58
+ run_cmd += f' --device {device}'
59
+ run_cmd += f' --mode {mode}'
60
+ run_cmd += f' --safetensors'
61
+ run_cmd += f' --linear_dim {linear_dim}'
62
+ run_cmd += f' --conv_dim {conv_dim}'
63
+ run_cmd += f' --linear_threshold {linear_threshold}'
64
+ run_cmd += f' --conv_threshold {conv_threshold}'
65
+ run_cmd += f' --linear_ratio {linear_ratio}'
66
+ run_cmd += f' --conv_ratio {conv_ratio}'
67
+ run_cmd += f' --linear_quantile {linear_quantile}'
68
+ run_cmd += f' --conv_quantile {conv_quantile}'
69
+ if use_sparse_bias:
70
+ run_cmd += f' --use_sparse_bias'
71
+ run_cmd += f' --sparsity {sparsity}'
72
+ if disable_cp:
73
+ run_cmd += f' --disable_cp'
74
+ run_cmd += f' "{base_model}"'
75
+ run_cmd += f' "{db_model}"'
76
+ run_cmd += f' "{output_name}"'
77
+
78
+ print(run_cmd)
79
+
80
+ # Run the command
81
+ if os.name == 'posix':
82
+ os.system(run_cmd)
83
+ else:
84
+ subprocess.run(run_cmd)
85
+
86
+
87
+ ###
88
+ # Gradio UI
89
+ ###
90
+ # def update_mode(mode):
91
+ # # 'fixed', 'threshold','ratio','quantile'
92
+ # if mode == 'fixed':
93
+ # return gr.Row.update(visible=True), gr.Row.update(visible=False), gr.Row.update(visible=False), gr.Row.update(visible=False)
94
+ # if mode == 'threshold':
95
+ # return gr.Row.update(visible=False), gr.Row.update(visible=True), gr.Row.update(visible=False), gr.Row.update(visible=False)
96
+ # if mode == 'ratio':
97
+ # return gr.Row.update(visible=False), gr.Row.update(visible=False), gr.Row.update(visible=True), gr.Row.update(visible=False)
98
+ # if mode == 'threshold':
99
+ # return gr.Row.update(visible=False), gr.Row.update(visible=False), gr.Row.update(visible=False), gr.Row.update(visible=True)
100
+
101
+
102
+ def update_mode(mode):
103
+ # Create a list of possible mode values
104
+ modes = ['fixed', 'threshold', 'ratio', 'quantile']
105
+
106
+ # Initialize an empty list to store visibility updates
107
+ updates = []
108
+
109
+ # Iterate through the possible modes
110
+ for m in modes:
111
+ # Add a visibility update for each mode, setting it to True if the input mode matches the current mode in the loop
112
+ updates.append(gr.Row.update(visible=(mode == m)))
113
+
114
+ # Return the visibility updates as a tuple
115
+ return tuple(updates)
116
+
117
+
118
+ def gradio_extract_lycoris_locon_tab():
119
+ with gr.Tab('Extract LyCORIS LoCON'):
120
+ gr.Markdown(
121
+ 'This utility can extract a LyCORIS LoCon network from a finetuned model.'
122
+ )
123
+ lora_ext = gr.Textbox(
124
+ value='*.safetensors', visible=False
125
+ ) # lora_ext = gr.Textbox(value='*.safetensors *.pt', visible=False)
126
+ lora_ext_name = gr.Textbox(value='LoRA model types', visible=False)
127
+ model_ext = gr.Textbox(value='*.safetensors *.ckpt', visible=False)
128
+ model_ext_name = gr.Textbox(value='Model types', visible=False)
129
+
130
+ with gr.Row():
131
+ db_model = gr.Textbox(
132
+ label='Finetuned model',
133
+ placeholder='Path to the finetuned model to extract',
134
+ interactive=True,
135
+ )
136
+ button_db_model_file = gr.Button(
137
+ folder_symbol, elem_id='open_folder_small'
138
+ )
139
+ button_db_model_file.click(
140
+ get_file_path,
141
+ inputs=[db_model, model_ext, model_ext_name],
142
+ outputs=db_model,
143
+ show_progress=False,
144
+ )
145
+
146
+ base_model = gr.Textbox(
147
+ label='Stable Diffusion base model',
148
+ placeholder='Stable Diffusion original model: ckpt or safetensors file',
149
+ interactive=True,
150
+ )
151
+ button_base_model_file = gr.Button(
152
+ folder_symbol, elem_id='open_folder_small'
153
+ )
154
+ button_base_model_file.click(
155
+ get_file_path,
156
+ inputs=[base_model, model_ext, model_ext_name],
157
+ outputs=base_model,
158
+ show_progress=False,
159
+ )
160
+ with gr.Row():
161
+ output_name = gr.Textbox(
162
+ label='Save to',
163
+ placeholder='path where to save the extracted LoRA model...',
164
+ interactive=True,
165
+ )
166
+ button_output_name = gr.Button(
167
+ folder_symbol, elem_id='open_folder_small'
168
+ )
169
+ button_output_name.click(
170
+ get_saveasfilename_path,
171
+ inputs=[output_name, lora_ext, lora_ext_name],
172
+ outputs=output_name,
173
+ show_progress=False,
174
+ )
175
+ device = gr.Dropdown(
176
+ label='Device',
177
+ choices=[
178
+ 'cpu',
179
+ 'cuda',
180
+ ],
181
+ value='cuda',
182
+ interactive=True,
183
+ )
184
+ is_v2 = gr.Checkbox(label='is v2', value=False, interactive=True)
185
+ mode = gr.Dropdown(
186
+ label='Mode',
187
+ choices=['fixed', 'threshold', 'ratio', 'quantile'],
188
+ value='fixed',
189
+ interactive=True,
190
+ )
191
+ with gr.Row(visible=True) as fixed:
192
+ linear_dim = gr.Slider(
193
+ minimum=1,
194
+ maximum=1024,
195
+ label='Network Dimension',
196
+ value=1,
197
+ step=1,
198
+ interactive=True,
199
+ )
200
+ conv_dim = gr.Slider(
201
+ minimum=1,
202
+ maximum=1024,
203
+ label='Conv Dimension',
204
+ value=1,
205
+ step=1,
206
+ interactive=True,
207
+ )
208
+ with gr.Row(visible=False) as threshold:
209
+ linear_threshold = gr.Slider(
210
+ minimum=0,
211
+ maximum=1,
212
+ label='Linear threshold',
213
+ value=0,
214
+ step=0.01,
215
+ interactive=True,
216
+ )
217
+ conv_threshold = gr.Slider(
218
+ minimum=0,
219
+ maximum=1,
220
+ label='Conv threshold',
221
+ value=0,
222
+ step=0.01,
223
+ interactive=True,
224
+ )
225
+ with gr.Row(visible=False) as ratio:
226
+ linear_ratio = gr.Slider(
227
+ minimum=0,
228
+ maximum=1,
229
+ label='Linear ratio',
230
+ value=0,
231
+ step=0.01,
232
+ interactive=True,
233
+ )
234
+ conv_ratio = gr.Slider(
235
+ minimum=0,
236
+ maximum=1,
237
+ label='Conv ratio',
238
+ value=0,
239
+ step=0.01,
240
+ interactive=True,
241
+ )
242
+ with gr.Row(visible=False) as quantile:
243
+ linear_quantile = gr.Slider(
244
+ minimum=0,
245
+ maximum=1,
246
+ label='Linear quantile',
247
+ value=0.75,
248
+ step=0.01,
249
+ interactive=True,
250
+ )
251
+ conv_quantile = gr.Slider(
252
+ minimum=0,
253
+ maximum=1,
254
+ label='Conv quantile',
255
+ value=0.75,
256
+ step=0.01,
257
+ interactive=True,
258
+ )
259
+ with gr.Row():
260
+ use_sparse_bias = gr.Checkbox(
261
+ label='Use sparse bias', value=False, interactive=True
262
+ )
263
+ sparsity = gr.Slider(
264
+ minimum=0,
265
+ maximum=1,
266
+ label='Sparsity',
267
+ value=0.98,
268
+ step=0.01,
269
+ interactive=True,
270
+ )
271
+ disable_cp = gr.Checkbox(
272
+ label='Disable CP decomposition', value=False, interactive=True
273
+ )
274
+ mode.change(
275
+ update_mode,
276
+ inputs=[mode],
277
+ outputs=[
278
+ fixed,
279
+ threshold,
280
+ ratio,
281
+ quantile,
282
+ ],
283
+ )
284
+
285
+ extract_button = gr.Button('Extract LyCORIS LoCon')
286
+
287
+ extract_button.click(
288
+ extract_lycoris_locon,
289
+ inputs=[
290
+ db_model,
291
+ base_model,
292
+ output_name,
293
+ device,
294
+ is_v2,
295
+ mode,
296
+ linear_dim,
297
+ conv_dim,
298
+ linear_threshold,
299
+ conv_threshold,
300
+ linear_ratio,
301
+ conv_ratio,
302
+ linear_quantile,
303
+ conv_quantile,
304
+ use_sparse_bias,
305
+ sparsity,
306
+ disable_cp,
307
+ ],
308
+ show_progress=False,
309
+ )
library/git_caption_gui.py ADDED
@@ -0,0 +1,136 @@
1
+ import gradio as gr
2
+ from easygui import msgbox
3
+ import subprocess
4
+ import os
5
+ from .common_gui import get_folder_path, add_pre_postfix
6
+
7
+ PYTHON = 'python3' if os.name == 'posix' else './venv/Scripts/python.exe'
8
+
9
+
10
+ def caption_images(
11
+ train_data_dir,
12
+ caption_ext,
13
+ batch_size,
14
+ max_data_loader_n_workers,
15
+ max_length,
16
+ model_id,
17
+ prefix,
18
+ postfix,
19
+ ):
20
+ # Check for images_dir_input
21
+ if train_data_dir == '':
22
+ msgbox('Image folder is missing...')
23
+ return
24
+
25
+ if caption_ext == '':
26
+ msgbox('Please provide an extension for the caption files.')
27
+ return
28
+
29
+ print(f'GIT captioning files in {train_data_dir}...')
30
+ run_cmd = f'{PYTHON} "finetune/make_captions_by_git.py"'
33
+ if not model_id == '':
34
+ run_cmd += f' --model_id="{model_id}"'
35
+ run_cmd += f' --batch_size="{int(batch_size)}"'
36
+ run_cmd += (
37
+ f' --max_data_loader_n_workers="{int(max_data_loader_n_workers)}"'
38
+ )
39
+ run_cmd += f' --max_length="{int(max_length)}"'
40
+ if caption_ext != '':
41
+ run_cmd += f' --caption_extension="{caption_ext}"'
42
+ run_cmd += f' "{train_data_dir}"'
43
+
44
+ print(run_cmd)
45
+
46
+ # Run the command
+ if os.name == 'posix':
+ os.system(run_cmd)
+ else:
+ subprocess.run(run_cmd)
48
+
49
+ # Add prefix and postfix
50
+ add_pre_postfix(
51
+ folder=train_data_dir,
52
+ caption_file_ext=caption_ext,
53
+ prefix=prefix,
54
+ postfix=postfix,
55
+ )
56
+
57
+ print('...captioning done')
58
+
59
+
60
+ ###
61
+ # Gradio UI
62
+ ###
63
+
64
+
65
+ def gradio_git_caption_gui_tab():
66
+ with gr.Tab('GIT Captioning'):
67
+ gr.Markdown(
68
+ 'This utility will use GIT to create caption files for each image in a folder.'
69
+ )
70
+ with gr.Row():
71
+ train_data_dir = gr.Textbox(
72
+ label='Image folder to caption',
73
+ placeholder='Directory containing the images to caption',
74
+ interactive=True,
75
+ )
76
+ button_train_data_dir_input = gr.Button(
77
+ '📂', elem_id='open_folder_small'
78
+ )
79
+ button_train_data_dir_input.click(
80
+ get_folder_path,
81
+ outputs=train_data_dir,
82
+ show_progress=False,
83
+ )
84
+ with gr.Row():
85
+ caption_ext = gr.Textbox(
86
+ label='Caption file extension',
87
+ placeholder='Extension for caption file. eg: .caption, .txt',
88
+ value='.txt',
89
+ interactive=True,
90
+ )
91
+
92
+ prefix = gr.Textbox(
93
+ label='Prefix to add to GIT caption',
94
+ placeholder='(Optional)',
95
+ interactive=True,
96
+ )
97
+
98
+ postfix = gr.Textbox(
99
+ label='Postfix to add to GIT caption',
100
+ placeholder='(Optional)',
101
+ interactive=True,
102
+ )
103
+
104
+ batch_size = gr.Number(
105
+ value=1, label='Batch size', interactive=True
106
+ )
107
+
108
+ with gr.Row():
109
+ max_data_loader_n_workers = gr.Number(
110
+ value=2, label='Number of workers', interactive=True
111
+ )
112
+ max_length = gr.Number(
113
+ value=75, label='Max length', interactive=True
114
+ )
115
+ model_id = gr.Textbox(
116
+ label='Model',
117
+ placeholder='(Optional) model id for GIT in Hugging Face',
118
+ interactive=True,
119
+ )
120
+
121
+ caption_button = gr.Button('Caption images')
122
+
123
+ caption_button.click(
124
+ caption_images,
125
+ inputs=[
126
+ train_data_dir,
127
+ caption_ext,
128
+ batch_size,
129
+ max_data_loader_n_workers,
130
+ max_length,
131
+ model_id,
132
+ prefix,
133
+ postfix,
134
+ ],
135
+ show_progress=False,
136
+ )
library/lpw_stable_diffusion.py ADDED
@@ -0,0 +1,1179 @@
1
+ # copy from https://github.com/huggingface/diffusers/blob/main/examples/community/lpw_stable_diffusion.py
2
+ # and modify to support SD2.x
3
+
4
+ import inspect
5
+ import re
6
+ from typing import Callable, List, Optional, Union
7
+
8
+ import numpy as np
9
+ import PIL
10
+ import torch
11
+ from packaging import version
12
+ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
13
+
14
+ import diffusers
15
+ from diffusers import SchedulerMixin, StableDiffusionPipeline
16
+ from diffusers.models import AutoencoderKL, UNet2DConditionModel
17
+ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
18
+ from diffusers.utils import logging
19
+
20
+
21
+ try:
22
+ from diffusers.utils import PIL_INTERPOLATION
23
+ except ImportError:
24
+ if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
25
+ PIL_INTERPOLATION = {
26
+ "linear": PIL.Image.Resampling.BILINEAR,
27
+ "bilinear": PIL.Image.Resampling.BILINEAR,
28
+ "bicubic": PIL.Image.Resampling.BICUBIC,
29
+ "lanczos": PIL.Image.Resampling.LANCZOS,
30
+ "nearest": PIL.Image.Resampling.NEAREST,
31
+ }
32
+ else:
33
+ PIL_INTERPOLATION = {
34
+ "linear": PIL.Image.LINEAR,
35
+ "bilinear": PIL.Image.BILINEAR,
36
+ "bicubic": PIL.Image.BICUBIC,
37
+ "lanczos": PIL.Image.LANCZOS,
38
+ "nearest": PIL.Image.NEAREST,
39
+ }
40
+ # ------------------------------------------------------------------------------
41
+
42
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
43
+
44
+ re_attention = re.compile(
45
+ r"""
46
+ \\\(|
47
+ \\\)|
48
+ \\\[|
49
+ \\]|
50
+ \\\\|
51
+ \\|
52
+ \(|
53
+ \[|
54
+ :([+-]?[.\d]+)\)|
55
+ \)|
56
+ ]|
57
+ [^\\()\[\]:]+|
58
+ :
59
+ """,
60
+ re.X,
61
+ )
62
+
63
+
64
+ def parse_prompt_attention(text):
65
+ """
66
+ Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
67
+ Accepted tokens are:
68
+ (abc) - increases attention to abc by a multiplier of 1.1
69
+ (abc:3.12) - increases attention to abc by a multiplier of 3.12
70
+ [abc] - decreases attention to abc by a multiplier of 1.1
71
+ \( - literal character '('
72
+ \[ - literal character '['
73
+ \) - literal character ')'
74
+ \] - literal character ']'
75
+ \\ - literal character '\'
76
+ anything else - just text
77
+ >>> parse_prompt_attention('normal text')
78
+ [['normal text', 1.0]]
79
+ >>> parse_prompt_attention('an (important) word')
80
+ [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
81
+ >>> parse_prompt_attention('(unbalanced')
82
+ [['unbalanced', 1.1]]
83
+ >>> parse_prompt_attention('\(literal\]')
84
+ [['(literal]', 1.0]]
85
+ >>> parse_prompt_attention('(unnecessary)(parens)')
86
+ [['unnecessaryparens', 1.1]]
87
+ >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
88
+ [['a ', 1.0],
89
+ ['house', 1.5730000000000004],
90
+ [' ', 1.1],
91
+ ['on', 1.0],
92
+ [' a ', 1.1],
93
+ ['hill', 0.55],
94
+ [', sun, ', 1.1],
95
+ ['sky', 1.4641000000000006],
96
+ ['.', 1.1]]
97
+ """
98
+
99
+ res = []
100
+ round_brackets = []
101
+ square_brackets = []
102
+
103
+ round_bracket_multiplier = 1.1
104
+ square_bracket_multiplier = 1 / 1.1
105
+
106
+ def multiply_range(start_position, multiplier):
107
+ for p in range(start_position, len(res)):
108
+ res[p][1] *= multiplier
109
+
110
+ for m in re_attention.finditer(text):
111
+ text = m.group(0)
112
+ weight = m.group(1)
113
+
114
+ if text.startswith("\\"):
115
+ res.append([text[1:], 1.0])
116
+ elif text == "(":
117
+ round_brackets.append(len(res))
118
+ elif text == "[":
119
+ square_brackets.append(len(res))
120
+ elif weight is not None and len(round_brackets) > 0:
121
+ multiply_range(round_brackets.pop(), float(weight))
122
+ elif text == ")" and len(round_brackets) > 0:
123
+ multiply_range(round_brackets.pop(), round_bracket_multiplier)
124
+ elif text == "]" and len(square_brackets) > 0:
125
+ multiply_range(square_brackets.pop(), square_bracket_multiplier)
126
+ else:
127
+ res.append([text, 1.0])
128
+
129
+ for pos in round_brackets:
130
+ multiply_range(pos, round_bracket_multiplier)
131
+
132
+ for pos in square_brackets:
133
+ multiply_range(pos, square_bracket_multiplier)
134
+
135
+ if len(res) == 0:
136
+ res = [["", 1.0]]
137
+
138
+ # merge runs of identical weights
139
+ i = 0
140
+ while i + 1 < len(res):
141
+ if res[i][1] == res[i + 1][1]:
142
+ res[i][0] += res[i + 1][0]
143
+ res.pop(i + 1)
144
+ else:
145
+ i += 1
146
+
147
+ return res
148
+
149
+
150
+ def get_prompts_with_weights(pipe: StableDiffusionPipeline, prompt: List[str], max_length: int):
151
+ r"""
152
+ Tokenize a list of prompts and return its tokens with weights of each token.
153
+
154
+ No padding, starting or ending token is included.
155
+ """
156
+ tokens = []
157
+ weights = []
158
+ truncated = False
159
+ for text in prompt:
160
+ texts_and_weights = parse_prompt_attention(text)
161
+ text_token = []
162
+ text_weight = []
163
+ for word, weight in texts_and_weights:
164
+ # tokenize and discard the starting and the ending token
165
+ token = pipe.tokenizer(word).input_ids[1:-1]
166
+ text_token += token
167
+ # copy the weight by length of token
168
+ text_weight += [weight] * len(token)
169
+ # stop if the text is too long (longer than truncation limit)
170
+ if len(text_token) > max_length:
171
+ truncated = True
172
+ break
173
+ # truncate
174
+ if len(text_token) > max_length:
175
+ truncated = True
176
+ text_token = text_token[:max_length]
177
+ text_weight = text_weight[:max_length]
178
+ tokens.append(text_token)
179
+ weights.append(text_weight)
180
+ if truncated:
181
+ logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples")
182
+ return tokens, weights
183
+
184
+
185
+ def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, no_boseos_middle=True, chunk_length=77):
186
+ r"""
187
+ Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length.
188
+ """
189
+ max_embeddings_multiples = (max_length - 2) // (chunk_length - 2)
190
+ weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length
191
+ for i in range(len(tokens)):
192
+ tokens[i] = [bos] + tokens[i] + [eos] * (max_length - 1 - len(tokens[i]))
193
+ if no_boseos_middle:
194
+ weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i]))
195
+ else:
196
+ w = []
197
+ if len(weights[i]) == 0:
198
+ w = [1.0] * weights_length
199
+ else:
200
+ for j in range(max_embeddings_multiples):
201
+ w.append(1.0) # weight for starting token in this chunk
202
+ w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))]
203
+ w.append(1.0) # weight for ending token in this chunk
204
+ w += [1.0] * (weights_length - len(w))
205
+ weights[i] = w[:]
206
+
207
+ return tokens, weights
208
+
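+ # Chunk arithmetic used above, for reference: with chunk_length = 77 and
+ # max_length = (77 - 2) * 3 + 2 = 227, max_embeddings_multiples = (227 - 2) // (77 - 2) = 3,
+ # so each entry is padded to 227 ids: one BOS, at most 225 content ids, then EOS padding.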
209
+
210
+ def get_unweighted_text_embeddings(
211
+ pipe: StableDiffusionPipeline,
212
+ text_input: torch.Tensor,
213
+ chunk_length: int,
214
+ clip_skip: int,
215
+ eos: int,
216
+ pad: int,
217
+ no_boseos_middle: Optional[bool] = True,
218
+ ):
219
+ """
220
+ When the length of tokens is a multiple of the capacity of the text encoder,
221
+ it should be split into chunks and sent to the text encoder individually.
222
+ """
223
+ max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2)
224
+ if max_embeddings_multiples > 1:
225
+ text_embeddings = []
226
+ for i in range(max_embeddings_multiples):
227
+ # extract the i-th chunk
228
+ text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone()
229
+
230
+ # cover the head and the tail by the starting and the ending tokens
231
+ text_input_chunk[:, 0] = text_input[0, 0]
232
+ if pad == eos: # v1
233
+ text_input_chunk[:, -1] = text_input[0, -1]
234
+ else: # v2
235
+ for j in range(len(text_input_chunk)):
236
+ if text_input_chunk[j, -1] != eos and text_input_chunk[j, -1] != pad:  # the last token is a normal token, not EOS/PAD
237
+ text_input_chunk[j, -1] = eos
238
+ if text_input_chunk[j, 1] == pad:  # the chunk contains only BOS, the rest is PAD
239
+ text_input_chunk[j, 1] = eos
240
+
241
+ if clip_skip is None or clip_skip == 1:
242
+ text_embedding = pipe.text_encoder(text_input_chunk)[0]
243
+ else:
244
+ enc_out = pipe.text_encoder(text_input_chunk, output_hidden_states=True, return_dict=True)
245
+ text_embedding = enc_out["hidden_states"][-clip_skip]
246
+ text_embedding = pipe.text_encoder.text_model.final_layer_norm(text_embedding)
247
+
253
+ if no_boseos_middle:
254
+ if i == 0:
255
+ # discard the ending token
256
+ text_embedding = text_embedding[:, :-1]
257
+ elif i == max_embeddings_multiples - 1:
258
+ # discard the starting token
259
+ text_embedding = text_embedding[:, 1:]
260
+ else:
261
+ # discard both starting and ending tokens
262
+ text_embedding = text_embedding[:, 1:-1]
263
+
264
+ text_embeddings.append(text_embedding)
265
+ text_embeddings = torch.concat(text_embeddings, axis=1)
266
+ else:
267
+ text_embeddings = pipe.text_encoder(text_input)[0]
268
+ return text_embeddings
269
+
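+ # Example of the chunking above: a 227-token input with chunk_length = 77 yields
+ # (227 - 2) // (77 - 2) = 3 chunks of 77 tokens each (BOS/EOS re-applied per chunk);
+ # the per-chunk embeddings are concatenated along the sequence axis (axis=1).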
270
+
271
+ def get_weighted_text_embeddings(
272
+ pipe: StableDiffusionPipeline,
273
+ prompt: Union[str, List[str]],
274
+ uncond_prompt: Optional[Union[str, List[str]]] = None,
275
+ max_embeddings_multiples: Optional[int] = 3,
276
+ no_boseos_middle: Optional[bool] = False,
277
+ skip_parsing: Optional[bool] = False,
278
+ skip_weighting: Optional[bool] = False,
279
+ clip_skip=None,
280
+ ):
281
+ r"""
282
+ Prompts can be assigned with local weights using brackets. For example,
283
+ prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful',
284
+ and the embedding tokens corresponding to the words get multiplied by a constant, 1.1.
285
+
286
+ Also, to regularize the embedding, the weighted embedding is scaled to preserve the original mean.
287
+
288
+ Args:
289
+ pipe (`StableDiffusionPipeline`):
290
+ Pipe to provide access to the tokenizer and the text encoder.
291
+ prompt (`str` or `List[str]`):
292
+ The prompt or prompts to guide the image generation.
293
+ uncond_prompt (`str` or `List[str]`):
294
+ The unconditional prompt or prompts to guide the image generation. If an unconditional prompt
295
+ is provided, the embeddings of prompt and uncond_prompt are concatenated.
296
+ max_embeddings_multiples (`int`, *optional*, defaults to `3`):
297
+ The max multiple length of prompt embeddings compared to the max output length of text encoder.
298
+ no_boseos_middle (`bool`, *optional*, defaults to `False`):
299
+ If the length of the text tokens is a multiple of the text encoder capacity, whether to keep the starting and
+ ending tokens in each of the middle chunks.
301
+ skip_parsing (`bool`, *optional*, defaults to `False`):
302
+ Skip the parsing of brackets.
303
+ skip_weighting (`bool`, *optional*, defaults to `False`):
304
+ Skip the weighting. When parsing is skipped, this is forced to True.
305
+ """
306
+ max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
307
+ if isinstance(prompt, str):
308
+ prompt = [prompt]
309
+
310
+ if not skip_parsing:
311
+ prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2)
312
+ if uncond_prompt is not None:
313
+ if isinstance(uncond_prompt, str):
314
+ uncond_prompt = [uncond_prompt]
315
+ uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2)
316
+ else:
317
+ prompt_tokens = [token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids]
318
+ prompt_weights = [[1.0] * len(token) for token in prompt_tokens]
319
+ if uncond_prompt is not None:
320
+ if isinstance(uncond_prompt, str):
321
+ uncond_prompt = [uncond_prompt]
322
+ uncond_tokens = [
323
+ token[1:-1] for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids
324
+ ]
325
+ uncond_weights = [[1.0] * len(token) for token in uncond_tokens]
326
+
327
+ # round up the longest length of tokens to a multiple of (model_max_length - 2)
328
+ max_length = max([len(token) for token in prompt_tokens])
329
+ if uncond_prompt is not None:
330
+ max_length = max(max_length, max([len(token) for token in uncond_tokens]))
331
+
332
+ max_embeddings_multiples = min(
333
+ max_embeddings_multiples,
334
+ (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1,
335
+ )
336
+ max_embeddings_multiples = max(1, max_embeddings_multiples)
337
+ max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
338
+
339
+ # pad the length of tokens and weights
340
+ bos = pipe.tokenizer.bos_token_id
341
+ eos = pipe.tokenizer.eos_token_id
342
+ pad = pipe.tokenizer.pad_token_id
343
+ prompt_tokens, prompt_weights = pad_tokens_and_weights(
344
+ prompt_tokens,
345
+ prompt_weights,
346
+ max_length,
347
+ bos,
348
+ eos,
349
+ no_boseos_middle=no_boseos_middle,
350
+ chunk_length=pipe.tokenizer.model_max_length,
351
+ )
352
+ prompt_tokens = torch.tensor(prompt_tokens, dtype=torch.long, device=pipe.device)
353
+ if uncond_prompt is not None:
354
+ uncond_tokens, uncond_weights = pad_tokens_and_weights(
355
+ uncond_tokens,
356
+ uncond_weights,
357
+ max_length,
358
+ bos,
359
+ eos,
360
+ no_boseos_middle=no_boseos_middle,
361
+ chunk_length=pipe.tokenizer.model_max_length,
362
+ )
363
+ uncond_tokens = torch.tensor(uncond_tokens, dtype=torch.long, device=pipe.device)
364
+
365
+ # get the embeddings
366
+ text_embeddings = get_unweighted_text_embeddings(
367
+ pipe,
368
+ prompt_tokens,
369
+ pipe.tokenizer.model_max_length,
370
+ clip_skip,
371
+ eos,
372
+ pad,
373
+ no_boseos_middle=no_boseos_middle,
374
+ )
375
+ prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=pipe.device)
376
+ if uncond_prompt is not None:
377
+ uncond_embeddings = get_unweighted_text_embeddings(
378
+ pipe,
379
+ uncond_tokens,
380
+ pipe.tokenizer.model_max_length,
381
+ clip_skip,
382
+ eos,
383
+ pad,
384
+ no_boseos_middle=no_boseos_middle,
385
+ )
386
+ uncond_weights = torch.tensor(uncond_weights, dtype=uncond_embeddings.dtype, device=pipe.device)
387
+
388
+ # assign weights to the prompts and normalize in the sense of mean
389
+ # TODO: should we normalize by chunk or in a whole (current implementation)?
390
+ if (not skip_parsing) and (not skip_weighting):
391
+ previous_mean = text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
392
+ text_embeddings *= prompt_weights.unsqueeze(-1)
393
+ current_mean = text_embeddings.float().mean(axis=[-2, -1]).to(text_embeddings.dtype)
394
+ text_embeddings *= (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
395
+ if uncond_prompt is not None:
396
+ previous_mean = uncond_embeddings.float().mean(axis=[-2, -1]).to(uncond_embeddings.dtype)
397
+ uncond_embeddings *= uncond_weights.unsqueeze(-1)
398
+ current_mean = uncond_embeddings.float().mean(axis=[-2, -1]).to(uncond_embeddings.dtype)
399
+ uncond_embeddings *= (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)
400
+
401
+ if uncond_prompt is not None:
402
+ return text_embeddings, uncond_embeddings
403
+ return text_embeddings, None
404
+
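+ # Minimal usage sketch (assumes `pipe` is an instance of the pipeline class defined below):
+ # cond, uncond = get_weighted_text_embeddings(pipe, "a (cat:1.2) on a roof", uncond_prompt="low quality")
+ # `cond` has shape (1, padded_length, hidden_size); `uncond` is None when uncond_prompt is omitted.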
405
+
406
+ def preprocess_image(image):
407
+ w, h = image.size
408
+ w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32
409
+ image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"])
410
+ image = np.array(image).astype(np.float32) / 255.0
411
+ image = image[None].transpose(0, 3, 1, 2)
412
+ image = torch.from_numpy(image)
413
+ return 2.0 * image - 1.0
414
+
415
+
416
+ def preprocess_mask(mask, scale_factor=8):
417
+ mask = mask.convert("L")
418
+ w, h = mask.size
419
+ w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32
420
+ mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"])
421
+ mask = np.array(mask).astype(np.float32) / 255.0
422
+ mask = np.tile(mask, (4, 1, 1))
423
+ mask = mask[None].transpose(0, 1, 2, 3)  # add a batch dimension; the identity-permutation transpose is a no-op
424
+ mask = 1 - mask # repaint white, keep black
425
+ mask = torch.from_numpy(mask)
426
+ return mask
427
+
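+ # Note on the mask convention above: after the `1 - mask` inversion, white input pixels become 0
+ # (regions to repaint) and black pixels become 1 (regions to keep); the single-channel mask is
+ # tiled to 4 channels so it can be broadcast against the 4-channel latents.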
428
+
429
+ class StableDiffusionLongPromptWeightingPipeline(StableDiffusionPipeline):
430
+ r"""
431
+ Pipeline for text-to-image generation using Stable Diffusion without a token length limit, with support for
+ parsing weighting syntax in the prompt.
433
+
434
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
435
+ library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
436
+
437
+ Args:
438
+ vae ([`AutoencoderKL`]):
439
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
440
+ text_encoder ([`CLIPTextModel`]):
441
+ Frozen text-encoder. Stable Diffusion uses the text portion of
442
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
443
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
444
+ tokenizer (`CLIPTokenizer`):
445
+ Tokenizer of class
446
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
447
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
448
+ scheduler ([`SchedulerMixin`]):
449
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
450
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
451
+ safety_checker ([`StableDiffusionSafetyChecker`]):
452
+ Classification module that estimates whether generated images could be considered offensive or harmful.
453
+ Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
454
+ feature_extractor ([`CLIPFeatureExtractor`]):
455
+ Model that extracts features from generated images to be used as inputs for the `safety_checker`.
456
+ """
457
+
458
+ # if version.parse(version.parse(diffusers.__version__).base_version) >= version.parse("0.9.0"):
459
+
460
+ def __init__(
461
+ self,
462
+ vae: AutoencoderKL,
463
+ text_encoder: CLIPTextModel,
464
+ tokenizer: CLIPTokenizer,
465
+ unet: UNet2DConditionModel,
466
+ scheduler: SchedulerMixin,
467
+ clip_skip: int,
468
+ safety_checker: StableDiffusionSafetyChecker,
469
+ feature_extractor: CLIPFeatureExtractor,
470
+ requires_safety_checker: bool = True,
471
+ ):
472
+ super().__init__(
473
+ vae=vae,
474
+ text_encoder=text_encoder,
475
+ tokenizer=tokenizer,
476
+ unet=unet,
477
+ scheduler=scheduler,
478
+ safety_checker=safety_checker,
479
+ feature_extractor=feature_extractor,
480
+ requires_safety_checker=requires_safety_checker,
481
+ )
482
+ self.clip_skip = clip_skip
483
+ self.__init__additional__()
484
+
485
+ # else:
486
+ # def __init__(
487
+ # self,
488
+ # vae: AutoencoderKL,
489
+ # text_encoder: CLIPTextModel,
490
+ # tokenizer: CLIPTokenizer,
491
+ # unet: UNet2DConditionModel,
492
+ # scheduler: SchedulerMixin,
493
+ # safety_checker: StableDiffusionSafetyChecker,
494
+ # feature_extractor: CLIPFeatureExtractor,
495
+ # ):
496
+ # super().__init__(
497
+ # vae=vae,
498
+ # text_encoder=text_encoder,
499
+ # tokenizer=tokenizer,
500
+ # unet=unet,
501
+ # scheduler=scheduler,
502
+ # safety_checker=safety_checker,
503
+ # feature_extractor=feature_extractor,
504
+ # )
505
+ # self.__init__additional__()
506
+
507
+ def __init__additional__(self):
508
+ if not hasattr(self, "vae_scale_factor"):
509
+ setattr(self, "vae_scale_factor", 2 ** (len(self.vae.config.block_out_channels) - 1))
510
+
511
+ @property
512
+ def _execution_device(self):
513
+ r"""
514
+ Returns the device on which the pipeline's models will be executed. After calling
515
+ `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
516
+ hooks.
517
+ """
518
+ if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
519
+ return self.device
520
+ for module in self.unet.modules():
521
+ if (
522
+ hasattr(module, "_hf_hook")
523
+ and hasattr(module._hf_hook, "execution_device")
524
+ and module._hf_hook.execution_device is not None
525
+ ):
526
+ return torch.device(module._hf_hook.execution_device)
527
+ return self.device
528
+
529
+ def _encode_prompt(
530
+ self,
531
+ prompt,
532
+ device,
533
+ num_images_per_prompt,
534
+ do_classifier_free_guidance,
535
+ negative_prompt,
536
+ max_embeddings_multiples,
537
+ ):
538
+ r"""
539
+ Encodes the prompt into text encoder hidden states.
540
+
541
+ Args:
542
+ prompt (`str` or `list(int)`):
543
+ prompt to be encoded
544
+ device: (`torch.device`):
545
+ torch device
546
+ num_images_per_prompt (`int`):
547
+ number of images that should be generated per prompt
548
+ do_classifier_free_guidance (`bool`):
549
+ whether to use classifier free guidance or not
550
+ negative_prompt (`str` or `List[str]`):
551
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
552
+ if `guidance_scale` is less than `1`).
553
+ max_embeddings_multiples (`int`, *optional*, defaults to `3`):
554
+ The max multiple length of prompt embeddings compared to the max output length of text encoder.
555
+ """
556
+ batch_size = len(prompt) if isinstance(prompt, list) else 1
557
+
558
+ if negative_prompt is None:
559
+ negative_prompt = [""] * batch_size
560
+ elif isinstance(negative_prompt, str):
561
+ negative_prompt = [negative_prompt] * batch_size
562
+ if batch_size != len(negative_prompt):
563
+ raise ValueError(
564
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
565
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
566
+ " the batch size of `prompt`."
567
+ )
568
+
569
+ text_embeddings, uncond_embeddings = get_weighted_text_embeddings(
570
+ pipe=self,
571
+ prompt=prompt,
572
+ uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
573
+ max_embeddings_multiples=max_embeddings_multiples,
574
+ clip_skip=self.clip_skip,
575
+ )
576
+ bs_embed, seq_len, _ = text_embeddings.shape
577
+ text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
578
+ text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
579
+
580
+ if do_classifier_free_guidance:
581
+ bs_embed, seq_len, _ = uncond_embeddings.shape
582
+ uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
583
+ uncond_embeddings = uncond_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
584
+ text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
585
+
586
+ return text_embeddings
587
+
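+ # When classifier-free guidance is enabled, the tensor returned above is torch.cat([uncond, cond]),
+ # so the negative-prompt embeddings occupy the first half of the batch; this ordering matches
+ # `noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)` in the denoising loop below.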
588
+ def check_inputs(self, prompt, height, width, strength, callback_steps):
589
+ if not isinstance(prompt, str) and not isinstance(prompt, list):
590
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
591
+
592
+ if strength < 0 or strength > 1:
593
+ raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}")
594
+
595
+ if height % 8 != 0 or width % 8 != 0:
596
+ print(height, width)
597
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
598
+
599
+ if (callback_steps is None) or (
600
+ callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
601
+ ):
602
+ raise ValueError(
603
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type" f" {type(callback_steps)}."
604
+ )
605
+
606
+ def get_timesteps(self, num_inference_steps, strength, device, is_text2img):
607
+ if is_text2img:
608
+ return self.scheduler.timesteps.to(device), num_inference_steps
609
+ else:
610
+ # get the original timestep using init_timestep
611
+ offset = self.scheduler.config.get("steps_offset", 0)
612
+ init_timestep = int(num_inference_steps * strength) + offset
613
+ init_timestep = min(init_timestep, num_inference_steps)
614
+
615
+ t_start = max(num_inference_steps - init_timestep + offset, 0)
616
+ timesteps = self.scheduler.timesteps[t_start:].to(device)
617
+ return timesteps, num_inference_steps - t_start
618
+
619
+ def run_safety_checker(self, image, device, dtype):
620
+ if self.safety_checker is not None:
621
+ safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
622
+ image, has_nsfw_concept = self.safety_checker(images=image, clip_input=safety_checker_input.pixel_values.to(dtype))
623
+ else:
624
+ has_nsfw_concept = None
625
+ return image, has_nsfw_concept
626
+
627
+ def decode_latents(self, latents):
628
+ latents = 1 / 0.18215 * latents
629
+ image = self.vae.decode(latents).sample
630
+ image = (image / 2 + 0.5).clamp(0, 1)
631
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
632
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
633
+ return image
634
+
635
+ def prepare_extra_step_kwargs(self, generator, eta):
636
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
637
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
638
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
639
+ # and should be between [0, 1]
640
+
641
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
642
+ extra_step_kwargs = {}
643
+ if accepts_eta:
644
+ extra_step_kwargs["eta"] = eta
645
+
646
+ # check if the scheduler accepts generator
647
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
648
+ if accepts_generator:
649
+ extra_step_kwargs["generator"] = generator
650
+ return extra_step_kwargs
651
+
652
+ def prepare_latents(self, image, timestep, batch_size, height, width, dtype, device, generator, latents=None):
653
+ if image is None:
654
+ shape = (
655
+ batch_size,
656
+ self.unet.in_channels,
657
+ height // self.vae_scale_factor,
658
+ width // self.vae_scale_factor,
659
+ )
660
+
661
+ if latents is None:
662
+ if device.type == "mps":
663
+ # randn does not work reproducibly on mps
664
+ latents = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
665
+ else:
666
+ latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
667
+ else:
668
+ if latents.shape != shape:
669
+ raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
670
+ latents = latents.to(device)
671
+
672
+ # scale the initial noise by the standard deviation required by the scheduler
673
+ latents = latents * self.scheduler.init_noise_sigma
674
+ return latents, None, None
675
+ else:
676
+ init_latent_dist = self.vae.encode(image).latent_dist
677
+ init_latents = init_latent_dist.sample(generator=generator)
678
+ init_latents = 0.18215 * init_latents
679
+ init_latents = torch.cat([init_latents] * batch_size, dim=0)
680
+ init_latents_orig = init_latents
681
+ shape = init_latents.shape
682
+
683
+ # add noise to latents using the timesteps
684
+ if device.type == "mps":
685
+ noise = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
686
+ else:
687
+ noise = torch.randn(shape, generator=generator, device=device, dtype=dtype)
688
+ latents = self.scheduler.add_noise(init_latents, noise, timestep)
689
+ return latents, init_latents_orig, noise
690
+
691
+ @torch.no_grad()
692
+ def __call__(
693
+ self,
694
+ prompt: Union[str, List[str]],
695
+ negative_prompt: Optional[Union[str, List[str]]] = None,
696
+ image: Union[torch.FloatTensor, PIL.Image.Image] = None,
697
+ mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
698
+ height: int = 512,
699
+ width: int = 512,
700
+ num_inference_steps: int = 50,
701
+ guidance_scale: float = 7.5,
702
+ strength: float = 0.8,
703
+ num_images_per_prompt: Optional[int] = 1,
704
+ eta: float = 0.0,
705
+ generator: Optional[torch.Generator] = None,
706
+ latents: Optional[torch.FloatTensor] = None,
707
+ max_embeddings_multiples: Optional[int] = 3,
708
+ output_type: Optional[str] = "pil",
709
+ return_dict: bool = True,
710
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
711
+ is_cancelled_callback: Optional[Callable[[], bool]] = None,
712
+ callback_steps: int = 1,
713
+ ):
714
+ r"""
715
+ Function invoked when calling the pipeline for generation.
716
+
717
+ Args:
718
+ prompt (`str` or `List[str]`):
719
+ The prompt or prompts to guide the image generation.
720
+ negative_prompt (`str` or `List[str]`, *optional*):
721
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
722
+ if `guidance_scale` is less than `1`).
723
+ image (`torch.FloatTensor` or `PIL.Image.Image`):
724
+ `Image`, or tensor representing an image batch, that will be used as the starting point for the
725
+ process.
726
+ mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
727
+ `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
728
+ replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
729
+ PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
730
+ contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
731
+ height (`int`, *optional*, defaults to 512):
732
+ The height in pixels of the generated image.
733
+ width (`int`, *optional*, defaults to 512):
734
+ The width in pixels of the generated image.
735
+ num_inference_steps (`int`, *optional*, defaults to 50):
736
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
737
+ expense of slower inference.
738
+ guidance_scale (`float`, *optional*, defaults to 7.5):
739
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
740
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
741
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
742
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
743
+ usually at the expense of lower image quality.
744
+ strength (`float`, *optional*, defaults to 0.8):
745
+ Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
746
+ `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
747
+ number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
748
+ noise will be maximum and the denoising process will run for the full number of iterations specified in
749
+ `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
750
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
751
+ The number of images to generate per prompt.
752
+ eta (`float`, *optional*, defaults to 0.0):
753
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
754
+ [`schedulers.DDIMScheduler`], will be ignored for others.
755
+ generator (`torch.Generator`, *optional*):
756
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
757
+ deterministic.
758
+ latents (`torch.FloatTensor`, *optional*):
759
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
760
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
761
+ tensor will be generated by sampling using the supplied random `generator`.
762
+ max_embeddings_multiples (`int`, *optional*, defaults to `3`):
763
+ The max multiple length of prompt embeddings compared to the max output length of text encoder.
764
+ output_type (`str`, *optional*, defaults to `"pil"`):
765
+ The output format of the generated image. Choose between
766
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
767
+ return_dict (`bool`, *optional*, defaults to `True`):
768
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
769
+ plain tuple.
770
+ callback (`Callable`, *optional*):
771
+ A function that will be called every `callback_steps` steps during inference. The function will be
772
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
773
+ is_cancelled_callback (`Callable`, *optional*):
774
+ A function that will be called every `callback_steps` steps during inference. If the function returns
775
+ `True`, the inference will be cancelled.
776
+ callback_steps (`int`, *optional*, defaults to 1):
777
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
778
+ called at every step.
779
+
780
+ Returns:
781
+ `None` if cancelled by `is_cancelled_callback`,
782
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
783
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
784
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
785
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
786
+ (nsfw) content, according to the `safety_checker`.
787
+ """
788
+ # 0. Default height and width to unet
789
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
790
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
791
+
792
+ # 1. Check inputs. Raise error if not correct
793
+ self.check_inputs(prompt, height, width, strength, callback_steps)
794
+
795
+ # 2. Define call parameters
796
+ batch_size = 1 if isinstance(prompt, str) else len(prompt)
797
+ device = self._execution_device
798
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
799
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
800
+ # corresponds to doing no classifier free guidance.
801
+ do_classifier_free_guidance = guidance_scale > 1.0
802
+
803
+ # 3. Encode input prompt
804
+ text_embeddings = self._encode_prompt(
805
+ prompt,
806
+ device,
807
+ num_images_per_prompt,
808
+ do_classifier_free_guidance,
809
+ negative_prompt,
810
+ max_embeddings_multiples,
811
+ )
812
+ dtype = text_embeddings.dtype
813
+
814
+ # 4. Preprocess image and mask
815
+ if isinstance(image, PIL.Image.Image):
816
+ image = preprocess_image(image)
817
+ if image is not None:
818
+ image = image.to(device=self.device, dtype=dtype)
819
+ if isinstance(mask_image, PIL.Image.Image):
820
+ mask_image = preprocess_mask(mask_image, self.vae_scale_factor)
821
+ if mask_image is not None:
822
+ mask = mask_image.to(device=self.device, dtype=dtype)
823
+ mask = torch.cat([mask] * batch_size * num_images_per_prompt)
824
+ else:
825
+ mask = None
826
+
827
+ # 5. set timesteps
828
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
829
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device, image is None)
830
+ latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
831
+
832
+ # 6. Prepare latent variables
833
+ latents, init_latents_orig, noise = self.prepare_latents(
834
+ image,
835
+ latent_timestep,
836
+ batch_size * num_images_per_prompt,
837
+ height,
838
+ width,
839
+ dtype,
840
+ device,
841
+ generator,
842
+ latents,
843
+ )
844
+
845
+ # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
846
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
847
+
848
+ # 8. Denoising loop
849
+ for i, t in enumerate(self.progress_bar(timesteps)):
850
+ # expand the latents if we are doing classifier free guidance
851
+ latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
852
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
853
+
854
+ # predict the noise residual
855
+ noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
856
+
857
+ # perform guidance
858
+ if do_classifier_free_guidance:
859
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
860
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
861
+
862
+ # compute the previous noisy sample x_t -> x_t-1
863
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
864
+
865
+ if mask is not None:
866
+ # masking
867
+ init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t]))
868
+ latents = (init_latents_proper * mask) + (latents * (1 - mask))
869
+
870
+ # call the callback, if provided
871
+ if i % callback_steps == 0:
872
+ if callback is not None:
873
+ callback(i, t, latents)
874
+ if is_cancelled_callback is not None and is_cancelled_callback():
875
+ return None
876
+
877
+ # 9. Post-processing
878
+ image = self.decode_latents(latents)
879
+
880
+ # 10. Run safety checker
881
+ image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)
882
+
883
+ # 11. Convert to PIL
884
+ if output_type == "pil":
885
+ image = self.numpy_to_pil(image)
886
+
887
+ if not return_dict:
888
+ return image, has_nsfw_concept
889
+
890
+ return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
891
+
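+ # Hedged usage sketch (prompt values are illustrative):
+ # output = pipe("a (very beautiful:1.3) landscape", negative_prompt="blurry",
+ #               num_inference_steps=30, guidance_scale=7.5)
+ # images = output.images
+ # The same __call__ also drives img2img (pass `image`) and inpaint (pass `image` and `mask_image`).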
892
+ def text2img(
893
+ self,
894
+ prompt: Union[str, List[str]],
895
+ negative_prompt: Optional[Union[str, List[str]]] = None,
896
+ height: int = 512,
897
+ width: int = 512,
898
+ num_inference_steps: int = 50,
899
+ guidance_scale: float = 7.5,
900
+ num_images_per_prompt: Optional[int] = 1,
901
+ eta: float = 0.0,
902
+ generator: Optional[torch.Generator] = None,
903
+ latents: Optional[torch.FloatTensor] = None,
904
+ max_embeddings_multiples: Optional[int] = 3,
905
+ output_type: Optional[str] = "pil",
906
+ return_dict: bool = True,
907
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
908
+ is_cancelled_callback: Optional[Callable[[], bool]] = None,
909
+ callback_steps: int = 1,
910
+ ):
911
+ r"""
912
+ Function for text-to-image generation.
913
+ Args:
914
+ prompt (`str` or `List[str]`):
915
+ The prompt or prompts to guide the image generation.
916
+ negative_prompt (`str` or `List[str]`, *optional*):
917
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
918
+ if `guidance_scale` is less than `1`).
919
+ height (`int`, *optional*, defaults to 512):
920
+ The height in pixels of the generated image.
921
+ width (`int`, *optional*, defaults to 512):
922
+ The width in pixels of the generated image.
923
+ num_inference_steps (`int`, *optional*, defaults to 50):
924
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
925
+ expense of slower inference.
926
+ guidance_scale (`float`, *optional*, defaults to 7.5):
927
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
928
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
929
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
930
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
931
+ usually at the expense of lower image quality.
932
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
933
+ The number of images to generate per prompt.
934
+ eta (`float`, *optional*, defaults to 0.0):
935
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
936
+ [`schedulers.DDIMScheduler`], will be ignored for others.
937
+ generator (`torch.Generator`, *optional*):
938
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
939
+ deterministic.
940
+ latents (`torch.FloatTensor`, *optional*):
941
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
942
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
943
+ tensor will be generated by sampling using the supplied random `generator`.
944
+ max_embeddings_multiples (`int`, *optional*, defaults to `3`):
945
+ The max multiple length of prompt embeddings compared to the max output length of text encoder.
946
+ output_type (`str`, *optional*, defaults to `"pil"`):
947
+ The output format of the generated image. Choose between
948
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
949
+ return_dict (`bool`, *optional*, defaults to `True`):
950
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
951
+ plain tuple.
952
+ callback (`Callable`, *optional*):
953
+ A function that will be called every `callback_steps` steps during inference. The function will be
954
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
955
+ is_cancelled_callback (`Callable`, *optional*):
956
+ A function that will be called every `callback_steps` steps during inference. If the function returns
957
+ `True`, the inference will be cancelled.
958
+ callback_steps (`int`, *optional*, defaults to 1):
959
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
960
+ called at every step.
961
+ Returns:
962
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
963
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
964
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
965
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
966
+ (nsfw) content, according to the `safety_checker`.
967
+ """
968
+ return self.__call__(
969
+ prompt=prompt,
970
+ negative_prompt=negative_prompt,
971
+ height=height,
972
+ width=width,
973
+ num_inference_steps=num_inference_steps,
974
+ guidance_scale=guidance_scale,
975
+ num_images_per_prompt=num_images_per_prompt,
976
+ eta=eta,
977
+ generator=generator,
978
+ latents=latents,
979
+ max_embeddings_multiples=max_embeddings_multiples,
980
+ output_type=output_type,
981
+ return_dict=return_dict,
982
+ callback=callback,
983
+ is_cancelled_callback=is_cancelled_callback,
984
+ callback_steps=callback_steps,
985
+ )
986
+
987
+ def img2img(
988
+ self,
989
+ image: Union[torch.FloatTensor, PIL.Image.Image],
990
+ prompt: Union[str, List[str]],
991
+ negative_prompt: Optional[Union[str, List[str]]] = None,
992
+ strength: float = 0.8,
993
+ num_inference_steps: Optional[int] = 50,
994
+ guidance_scale: Optional[float] = 7.5,
995
+ num_images_per_prompt: Optional[int] = 1,
996
+ eta: Optional[float] = 0.0,
997
+ generator: Optional[torch.Generator] = None,
998
+ max_embeddings_multiples: Optional[int] = 3,
999
+ output_type: Optional[str] = "pil",
1000
+ return_dict: bool = True,
1001
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
1002
+ is_cancelled_callback: Optional[Callable[[], bool]] = None,
1003
+ callback_steps: int = 1,
1004
+ ):
1005
+ r"""
1006
+ Function for image-to-image generation.
1007
+ Args:
1008
+ image (`torch.FloatTensor` or `PIL.Image.Image`):
1009
+ `Image`, or tensor representing an image batch, that will be used as the starting point for the
1010
+ process.
1011
+ prompt (`str` or `List[str]`):
1012
+ The prompt or prompts to guide the image generation.
1013
+ negative_prompt (`str` or `List[str]`, *optional*):
1014
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
1015
+ if `guidance_scale` is less than `1`).
1016
+ strength (`float`, *optional*, defaults to 0.8):
1017
+ Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
1018
+ `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
1019
+ number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
1020
+ noise will be maximum and the denoising process will run for the full number of iterations specified in
1021
+ `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
1022
+ num_inference_steps (`int`, *optional*, defaults to 50):
1023
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
1024
+ expense of slower inference. This parameter will be modulated by `strength`.
1025
+ guidance_scale (`float`, *optional*, defaults to 7.5):
1026
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
1027
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
1028
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1029
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
1030
+ usually at the expense of lower image quality.
1031
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
1032
+ The number of images to generate per prompt.
1033
+ eta (`float`, *optional*, defaults to 0.0):
1034
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
1035
+ [`schedulers.DDIMScheduler`], will be ignored for others.
1036
+ generator (`torch.Generator`, *optional*):
1037
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
1038
+ deterministic.
1039
+ max_embeddings_multiples (`int`, *optional*, defaults to `3`):
1040
+ The max multiple length of prompt embeddings compared to the max output length of text encoder.
1041
+ output_type (`str`, *optional*, defaults to `"pil"`):
1042
+ The output format of the generated image. Choose between
1043
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
1044
+ return_dict (`bool`, *optional*, defaults to `True`):
1045
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
1046
+ plain tuple.
1047
+ callback (`Callable`, *optional*):
1048
+ A function that will be called every `callback_steps` steps during inference. The function will be
1049
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
1050
+ is_cancelled_callback (`Callable`, *optional*):
1051
+ A function that will be called every `callback_steps` steps during inference. If the function returns
1052
+ `True`, the inference will be cancelled.
1053
+ callback_steps (`int`, *optional*, defaults to 1):
1054
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
1055
+ called at every step.
1056
+ Returns:
1057
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
1058
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
1059
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
1060
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
1061
+ (nsfw) content, according to the `safety_checker`.
1062
+ """
1063
+ return self.__call__(
1064
+ prompt=prompt,
1065
+ negative_prompt=negative_prompt,
1066
+ image=image,
1067
+ num_inference_steps=num_inference_steps,
1068
+ guidance_scale=guidance_scale,
1069
+ strength=strength,
1070
+ num_images_per_prompt=num_images_per_prompt,
1071
+ eta=eta,
1072
+ generator=generator,
1073
+ max_embeddings_multiples=max_embeddings_multiples,
1074
+ output_type=output_type,
1075
+ return_dict=return_dict,
1076
+ callback=callback,
1077
+ is_cancelled_callback=is_cancelled_callback,
1078
+ callback_steps=callback_steps,
1079
+ )
1080
+
1081
+ def inpaint(
1082
+ self,
1083
+ image: Union[torch.FloatTensor, PIL.Image.Image],
1084
+ mask_image: Union[torch.FloatTensor, PIL.Image.Image],
1085
+ prompt: Union[str, List[str]],
1086
+ negative_prompt: Optional[Union[str, List[str]]] = None,
1087
+ strength: float = 0.8,
1088
+ num_inference_steps: Optional[int] = 50,
1089
+ guidance_scale: Optional[float] = 7.5,
1090
+ num_images_per_prompt: Optional[int] = 1,
1091
+ eta: Optional[float] = 0.0,
1092
+ generator: Optional[torch.Generator] = None,
1093
+ max_embeddings_multiples: Optional[int] = 3,
1094
+ output_type: Optional[str] = "pil",
1095
+ return_dict: bool = True,
1096
+ callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
1097
+ is_cancelled_callback: Optional[Callable[[], bool]] = None,
1098
+ callback_steps: int = 1,
1099
+ ):
1100
+ r"""
1101
+ Function for inpaint.
1102
+ Args:
1103
+ image (`torch.FloatTensor` or `PIL.Image.Image`):
1104
+ `Image`, or tensor representing an image batch, that will be used as the starting point for the
1105
+ process. This is the image whose masked region will be inpainted.
1106
+ mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
1107
+ `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
1108
+ replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
1109
+ PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
1110
+ contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
1111
+ prompt (`str` or `List[str]`):
1112
+ The prompt or prompts to guide the image generation.
1113
+ negative_prompt (`str` or `List[str]`, *optional*):
1114
+ The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
1115
+ if `guidance_scale` is less than `1`).
1116
+ strength (`float`, *optional*, defaults to 0.8):
1117
+ Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
1118
+ is 1, the denoising process will be run on the masked area for the full number of iterations specified
1119
+ in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
1120
+ noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
1121
+ num_inference_steps (`int`, *optional*, defaults to 50):
1122
+ The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
1123
+ the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
1124
+ guidance_scale (`float`, *optional*, defaults to 7.5):
1125
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
1126
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
1127
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1128
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
1129
+ usually at the expense of lower image quality.
1130
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
1131
+ The number of images to generate per prompt.
1132
+ eta (`float`, *optional*, defaults to 0.0):
1133
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
1134
+ [`schedulers.DDIMScheduler`], will be ignored for others.
1135
+ generator (`torch.Generator`, *optional*):
1136
+ A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
1137
+ deterministic.
1138
+ max_embeddings_multiples (`int`, *optional*, defaults to `3`):
1139
+ The max multiple length of prompt embeddings compared to the max output length of text encoder.
1140
+ output_type (`str`, *optional*, defaults to `"pil"`):
1141
+ The output format of the generated image. Choose between
1142
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
1143
+ return_dict (`bool`, *optional*, defaults to `True`):
1144
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
1145
+ plain tuple.
1146
+ callback (`Callable`, *optional*):
1147
+ A function that will be called every `callback_steps` steps during inference. The function will be
1148
+ called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
1149
+ is_cancelled_callback (`Callable`, *optional*):
1150
+ A function that will be called every `callback_steps` steps during inference. If the function returns
1151
+ `True`, the inference will be cancelled.
1152
+ callback_steps (`int`, *optional*, defaults to 1):
1153
+ The frequency at which the `callback` function will be called. If not specified, the callback will be
1154
+ called at every step.
1155
+ Returns:
1156
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
1157
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
1158
+ When returning a tuple, the first element is a list with the generated images, and the second element is a
1159
+ list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
1160
+ (nsfw) content, according to the `safety_checker`.
1161
+ """
1162
+ return self.__call__(
1163
+ prompt=prompt,
1164
+ negative_prompt=negative_prompt,
1165
+ image=image,
1166
+ mask_image=mask_image,
1167
+ num_inference_steps=num_inference_steps,
1168
+ guidance_scale=guidance_scale,
1169
+ strength=strength,
1170
+ num_images_per_prompt=num_images_per_prompt,
1171
+ eta=eta,
1172
+ generator=generator,
1173
+ max_embeddings_multiples=max_embeddings_multiples,
1174
+ output_type=output_type,
1175
+ return_dict=return_dict,
1176
+ callback=callback,
1177
+ is_cancelled_callback=is_cancelled_callback,
1178
+ callback_steps=callback_steps,
1179
+ )
library/merge_lora_gui.py ADDED
@@ -0,0 +1,156 @@
1
+ import gradio as gr
2
+ from easygui import msgbox
3
+ import subprocess
4
+ import os
5
+ from .common_gui import (
6
+ get_saveasfilename_path,
7
+ get_any_file_path,
8
+ get_file_path,
9
+ )
10
+
11
+ folder_symbol = '\U0001f4c2' # 📂
12
+ refresh_symbol = '\U0001f504' # 🔄
13
+ save_style_symbol = '\U0001f4be' # 💾
14
+ document_symbol = '\U0001F4C4' # 📄
15
+ PYTHON = 'python3' if os.name == 'posix' else './venv/Scripts/python.exe'
16
+
17
+
18
+ def merge_lora(
19
+ lora_a_model,
20
+ lora_b_model,
21
+ ratio,
22
+ save_to,
23
+ precision,
24
+ save_precision,
25
+ ):
26
+ # Check that both LoRA model paths were provided
27
+ if lora_a_model == '':
28
+ msgbox('Invalid model A file')
29
+ return
30
+
31
+ if lora_b_model == '':
32
+ msgbox('Invalid model B file')
33
+ return
34
+
35
+ # Check that the provided LoRA model files exist
36
+ if not os.path.isfile(lora_a_model):
37
+ msgbox('The provided model A is not a file')
38
+ return
39
+
40
+ if not os.path.isfile(lora_b_model):
41
+ msgbox('The provided model B is not a file')
42
+ return
43
+
44
+ ratio_a = ratio
45
+ ratio_b = 1 - ratio
46
+
47
+ run_cmd = f'{PYTHON} "{os.path.join("networks","merge_lora.py")}"'
48
+ run_cmd += f' --save_precision {save_precision}'
49
+ run_cmd += f' --precision {precision}'
50
+ run_cmd += f' --save_to "{save_to}"'
51
+ run_cmd += f' --models "{lora_a_model}" "{lora_b_model}"'
52
+ run_cmd += f' --ratios {ratio_a} {ratio_b}'
53
+
54
+ print(run_cmd)
55
+
56
+ # Run the command
57
+ if os.name == 'posix':
58
+ os.system(run_cmd)
59
+ else:
60
+ subprocess.run(run_cmd)
61
+
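+ # Example of the command assembled above (illustrative file names, ratio slider at 0.5, default precisions):
+ # python3 "networks/merge_lora.py" --save_precision float --precision float
+ # --save_to "merged.safetensors" --models "a.safetensors" "b.safetensors" --ratios 0.5 0.5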
62
+
63
+ ###
64
+ # Gradio UI
65
+ ###
66
+
67
+
68
+ def gradio_merge_lora_tab():
69
+ with gr.Tab('Merge LoRA'):
70
+ gr.Markdown('This utility can merge two LoRA networks together.')
71
+
72
+ lora_ext = gr.Textbox(value='*.safetensors *.pt', visible=False)
73
+ lora_ext_name = gr.Textbox(value='LoRA model types', visible=False)
74
+
75
+ with gr.Row():
76
+ lora_a_model = gr.Textbox(
77
+ label='LoRA model "A"',
78
+ placeholder='Path to the LoRA A model',
79
+ interactive=True,
80
+ )
81
+ button_lora_a_model_file = gr.Button(
82
+ folder_symbol, elem_id='open_folder_small'
83
+ )
84
+ button_lora_a_model_file.click(
85
+ get_file_path,
86
+ inputs=[lora_a_model, lora_ext, lora_ext_name],
87
+ outputs=lora_a_model,
88
+ show_progress=False,
89
+ )
90
+
91
+ lora_b_model = gr.Textbox(
92
+ label='LoRA model "B"',
93
+ placeholder='Path to the LoRA B model',
94
+ interactive=True,
95
+ )
96
+ button_lora_b_model_file = gr.Button(
97
+ folder_symbol, elem_id='open_folder_small'
98
+ )
99
+ button_lora_b_model_file.click(
100
+ get_file_path,
101
+ inputs=[lora_b_model, lora_ext, lora_ext_name],
102
+ outputs=lora_b_model,
103
+ show_progress=False,
104
+ )
105
+ with gr.Row():
106
+ ratio = gr.Slider(
107
+ label='Merge ratio (e.g. 0.7 means 70% of model A and 30% of model B)',
108
+ minimum=0,
109
+ maximum=1,
110
+ step=0.01,
111
+ value=0.5,
112
+ interactive=True,
113
+ )
114
+
115
+ with gr.Row():
116
+ save_to = gr.Textbox(
117
+ label='Save to',
118
+ placeholder='path for the file to save...',
119
+ interactive=True,
120
+ )
121
+ button_save_to = gr.Button(
122
+ folder_symbol, elem_id='open_folder_small'
123
+ )
124
+ button_save_to.click(
125
+ get_saveasfilename_path,
126
+ inputs=[save_to, lora_ext, lora_ext_name],
127
+ outputs=save_to,
128
+ show_progress=False,
129
+ )
130
+ precision = gr.Dropdown(
131
+ label='Merge precision',
132
+ choices=['fp16', 'bf16', 'float'],
133
+ value='float',
134
+ interactive=True,
135
+ )
136
+ save_precision = gr.Dropdown(
137
+ label='Save precision',
138
+ choices=['fp16', 'bf16', 'float'],
139
+ value='float',
140
+ interactive=True,
141
+ )
142
+
143
+ convert_button = gr.Button('Merge model')
144
+
145
+ convert_button.click(
146
+ merge_lora,
147
+ inputs=[
148
+ lora_a_model,
149
+ lora_b_model,
150
+ ratio,
151
+ save_to,
152
+ precision,
153
+ save_precision,
154
+ ],
155
+ show_progress=False,
156
+ )
library/model_util.py ADDED
@@ -0,0 +1,1165 @@
1
+ # v1: split from train_db_fixed.py.
2
+ # v2: support safetensors
3
+
4
+ import math
5
+ import os
6
+ import torch
7
+ from transformers import CLIPTextModel, CLIPTokenizer, CLIPTextConfig, logging
8
+ from diffusers import AutoencoderKL, DDIMScheduler, StableDiffusionPipeline, UNet2DConditionModel
9
+ from safetensors.torch import load_file, save_file
10
+
11
+ # model parameters for the Diffusers version of Stable Diffusion
12
+ NUM_TRAIN_TIMESTEPS = 1000
13
+ BETA_START = 0.00085
14
+ BETA_END = 0.0120
15
+
16
+ UNET_PARAMS_MODEL_CHANNELS = 320
17
+ UNET_PARAMS_CHANNEL_MULT = [1, 2, 4, 4]
18
+ UNET_PARAMS_ATTENTION_RESOLUTIONS = [4, 2, 1]
19
+ UNET_PARAMS_IMAGE_SIZE = 64 # fixed from old invalid value `32`
20
+ UNET_PARAMS_IN_CHANNELS = 4
21
+ UNET_PARAMS_OUT_CHANNELS = 4
22
+ UNET_PARAMS_NUM_RES_BLOCKS = 2
23
+ UNET_PARAMS_CONTEXT_DIM = 768
24
+ UNET_PARAMS_NUM_HEADS = 8
25
+
26
+ VAE_PARAMS_Z_CHANNELS = 4
27
+ VAE_PARAMS_RESOLUTION = 256
28
+ VAE_PARAMS_IN_CHANNELS = 3
29
+ VAE_PARAMS_OUT_CH = 3
30
+ VAE_PARAMS_CH = 128
31
+ VAE_PARAMS_CH_MULT = [1, 2, 4, 4]
32
+ VAE_PARAMS_NUM_RES_BLOCKS = 2
33
+
34
+ # V2
35
+ V2_UNET_PARAMS_ATTENTION_HEAD_DIM = [5, 10, 20, 20]
36
+ V2_UNET_PARAMS_CONTEXT_DIM = 1024
37
+
38
+ # reference model IDs used to load the Diffusers configuration
39
+ DIFFUSERS_REF_MODEL_ID_V1 = "runwayml/stable-diffusion-v1-5"
40
+ DIFFUSERS_REF_MODEL_ID_V2 = "stabilityai/stable-diffusion-2-1"
41
+
42
+
43
+ # region StableDiffusion->Diffusersの変換コード
44
+ # convert_original_stable_diffusion_to_diffusers をコピーして修正している(ASL 2.0)
45
+
46
+
47
+ def shave_segments(path, n_shave_prefix_segments=1):
48
+ """
49
+ Removes segments. Positive values shave the first segments, negative shave the last segments.
50
+ """
51
+ if n_shave_prefix_segments >= 0:
52
+ return ".".join(path.split(".")[n_shave_prefix_segments:])
53
+ else:
54
+ return ".".join(path.split(".")[:n_shave_prefix_segments])
55
+
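+ # Illustrative behaviour of shave_segments:
+ # shave_segments("input_blocks.1.0.in_layers.0.weight", 2) -> "0.in_layers.0.weight"
+ # shave_segments("input_blocks.1.0.in_layers.0.weight", -1) -> "input_blocks.1.0.in_layers.0"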
56
+
57
+ def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
58
+ """
59
+ Updates paths inside resnets to the new naming scheme (local renaming)
60
+ """
61
+ mapping = []
62
+ for old_item in old_list:
63
+ new_item = old_item.replace("in_layers.0", "norm1")
64
+ new_item = new_item.replace("in_layers.2", "conv1")
65
+
66
+ new_item = new_item.replace("out_layers.0", "norm2")
67
+ new_item = new_item.replace("out_layers.3", "conv2")
68
+
69
+ new_item = new_item.replace("emb_layers.1", "time_emb_proj")
70
+ new_item = new_item.replace("skip_connection", "conv_shortcut")
71
+
72
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
73
+
74
+ mapping.append({"old": old_item, "new": new_item})
75
+
76
+ return mapping
77
+
78
+
79
+ def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0):
80
+ """
81
+ Updates paths inside resnets to the new naming scheme (local renaming)
82
+ """
83
+ mapping = []
84
+ for old_item in old_list:
85
+ new_item = old_item
86
+
87
+ new_item = new_item.replace("nin_shortcut", "conv_shortcut")
88
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
89
+
90
+ mapping.append({"old": old_item, "new": new_item})
91
+
92
+ return mapping
93
+
94
+
95
+ def renew_attention_paths(old_list, n_shave_prefix_segments=0):
96
+ """
97
+ Updates paths inside attentions to the new naming scheme (local renaming)
98
+ """
99
+ mapping = []
100
+ for old_item in old_list:
101
+ new_item = old_item
102
+
103
+ # new_item = new_item.replace('norm.weight', 'group_norm.weight')
104
+ # new_item = new_item.replace('norm.bias', 'group_norm.bias')
105
+
106
+ # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
107
+ # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
108
+
109
+ # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
110
+
111
+ mapping.append({"old": old_item, "new": new_item})
112
+
113
+ return mapping
114
+
115
+
116
+ def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0):
117
+ """
118
+ Updates paths inside attentions to the new naming scheme (local renaming)
119
+ """
120
+ mapping = []
121
+ for old_item in old_list:
122
+ new_item = old_item
123
+
124
+ new_item = new_item.replace("norm.weight", "group_norm.weight")
125
+ new_item = new_item.replace("norm.bias", "group_norm.bias")
126
+
127
+ new_item = new_item.replace("q.weight", "query.weight")
128
+ new_item = new_item.replace("q.bias", "query.bias")
129
+
130
+ new_item = new_item.replace("k.weight", "key.weight")
131
+ new_item = new_item.replace("k.bias", "key.bias")
132
+
133
+ new_item = new_item.replace("v.weight", "value.weight")
134
+ new_item = new_item.replace("v.bias", "value.bias")
135
+
136
+ new_item = new_item.replace("proj_out.weight", "proj_attn.weight")
137
+ new_item = new_item.replace("proj_out.bias", "proj_attn.bias")
138
+
139
+ new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
140
+
141
+ mapping.append({"old": old_item, "new": new_item})
142
+
143
+ return mapping
144
+
145
+
146
+ def assign_to_checkpoint(
147
+ paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None
148
+ ):
149
+ """
150
+ This does the final conversion step: take locally converted weights and apply a global renaming
151
+ to them. It splits attention layers, and takes into account additional replacements
152
+ that may arise.
153
+
154
+ Assigns the weights to the new checkpoint.
155
+ """
156
+ assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys."
157
+
158
+ # Splits the attention layers into three variables.
159
+ if attention_paths_to_split is not None:
160
+ for path, path_map in attention_paths_to_split.items():
161
+ old_tensor = old_checkpoint[path]
162
+ channels = old_tensor.shape[0] // 3
163
+
164
+ target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1)
165
+
166
+ num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3
167
+
168
+ old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:])
169
+ query, key, value = old_tensor.split(channels // num_heads, dim=1)
170
+
171
+ checkpoint[path_map["query"]] = query.reshape(target_shape)
172
+ checkpoint[path_map["key"]] = key.reshape(target_shape)
173
+ checkpoint[path_map["value"]] = value.reshape(target_shape)
174
+
175
+ for path in paths:
176
+ new_path = path["new"]
177
+
178
+ # These have already been assigned
179
+ if attention_paths_to_split is not None and new_path in attention_paths_to_split:
180
+ continue
181
+
182
+ # Global renaming happens here
183
+ new_path = new_path.replace("middle_block.0", "mid_block.resnets.0")
184
+ new_path = new_path.replace("middle_block.1", "mid_block.attentions.0")
185
+ new_path = new_path.replace("middle_block.2", "mid_block.resnets.1")
186
+
187
+ if additional_replacements is not None:
188
+ for replacement in additional_replacements:
189
+ new_path = new_path.replace(replacement["old"], replacement["new"])
190
+
191
+ # proj_attn.weight has to be converted from conv 1D to linear
192
+ if "proj_attn.weight" in new_path:
193
+ checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0]
194
+ else:
195
+ checkpoint[new_path] = old_checkpoint[path["old"]]
196
+
197
+
198
+ def conv_attn_to_linear(checkpoint):
199
+ keys = list(checkpoint.keys())
200
+ attn_keys = ["query.weight", "key.weight", "value.weight"]
201
+ for key in keys:
202
+ if ".".join(key.split(".")[-2:]) in attn_keys:
203
+ if checkpoint[key].ndim > 2:
204
+ checkpoint[key] = checkpoint[key][:, :, 0, 0]
205
+ elif "proj_attn.weight" in key:
206
+ if checkpoint[key].ndim > 2:
207
+ checkpoint[key] = checkpoint[key][:, :, 0]
208
+
209
+
210
+ def linear_transformer_to_conv(checkpoint):
211
+ keys = list(checkpoint.keys())
212
+ tf_keys = ["proj_in.weight", "proj_out.weight"]
213
+ for key in keys:
214
+ if ".".join(key.split(".")[-2:]) in tf_keys:
215
+ if checkpoint[key].ndim == 2:
216
+ checkpoint[key] = checkpoint[key].unsqueeze(2).unsqueeze(2)
217
+
218
+
219
+ def convert_ldm_unet_checkpoint(v2, checkpoint, config):
220
+ """
221
+ Takes a state dict and a config, and returns a converted checkpoint.
222
+ """
223
+
224
+ # extract state_dict for UNet
225
+ unet_state_dict = {}
226
+ unet_key = "model.diffusion_model."
227
+ keys = list(checkpoint.keys())
228
+ for key in keys:
229
+ if key.startswith(unet_key):
230
+ unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key)
231
+
232
+ new_checkpoint = {}
233
+
234
+ new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
235
+ new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
236
+ new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
237
+ new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
238
+
239
+ new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
240
+ new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
241
+
242
+ new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
243
+ new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
244
+ new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
245
+ new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
246
+
247
+ # Retrieves the keys for the input blocks only
248
+ num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
249
+ input_blocks = {
250
+ layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}." in key] for layer_id in range(num_input_blocks)
251
+ }
252
+
253
+ # Retrieves the keys for the middle blocks only
254
+ num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
255
+ middle_blocks = {
256
+ layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}." in key] for layer_id in range(num_middle_blocks)
257
+ }
258
+
259
+ # Retrieves the keys for the output blocks only
260
+ num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
261
+ output_blocks = {
262
+ layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}." in key] for layer_id in range(num_output_blocks)
263
+ }
264
+
265
+ for i in range(1, num_input_blocks):
266
+ block_id = (i - 1) // (config["layers_per_block"] + 1)
267
+ layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
268
+
269
+ resnets = [key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key]
270
+ attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
271
+
272
+ if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
273
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
274
+ f"input_blocks.{i}.0.op.weight"
275
+ )
276
+ new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(f"input_blocks.{i}.0.op.bias")
277
+
278
+ paths = renew_resnet_paths(resnets)
279
+ meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
280
+ assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)
281
+
282
+ if len(attentions):
283
+ paths = renew_attention_paths(attentions)
284
+ meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
285
+ assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)
286
+
287
+ resnet_0 = middle_blocks[0]
288
+ attentions = middle_blocks[1]
289
+ resnet_1 = middle_blocks[2]
290
+
291
+ resnet_0_paths = renew_resnet_paths(resnet_0)
292
+ assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
293
+
294
+ resnet_1_paths = renew_resnet_paths(resnet_1)
295
+ assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
296
+
297
+ attentions_paths = renew_attention_paths(attentions)
298
+ meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
299
+ assign_to_checkpoint(attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)
300
+
301
+ for i in range(num_output_blocks):
302
+ block_id = i // (config["layers_per_block"] + 1)
303
+ layer_in_block_id = i % (config["layers_per_block"] + 1)
304
+ output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
305
+ output_block_list = {}
306
+
307
+ for layer in output_block_layers:
308
+ layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
309
+ if layer_id in output_block_list:
310
+ output_block_list[layer_id].append(layer_name)
311
+ else:
312
+ output_block_list[layer_id] = [layer_name]
313
+
314
+ if len(output_block_list) > 1:
315
+ resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
316
+ attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
317
+
318
+ resnet_0_paths = renew_resnet_paths(resnets)
319
+ paths = renew_resnet_paths(resnets)
320
+
321
+ meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
322
+ assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)
323
+
324
+ # original:
325
+ # if ["conv.weight", "conv.bias"] in output_block_list.values():
326
+ # index = list(output_block_list.values()).index(["conv.weight", "conv.bias"])
327
+
328
+ # avoid depending on the order of bias and weight; there is probably a better way
329
+ for l in output_block_list.values():
330
+ l.sort()
331
+
332
+ if ["conv.bias", "conv.weight"] in output_block_list.values():
333
+ index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
334
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
335
+ f"output_blocks.{i}.{index}.conv.bias"
336
+ ]
337
+ new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
338
+ f"output_blocks.{i}.{index}.conv.weight"
339
+ ]
340
+
341
+ # Clear attentions as they have been attributed above.
342
+ if len(attentions) == 2:
343
+ attentions = []
344
+
345
+ if len(attentions):
346
+ paths = renew_attention_paths(attentions)
347
+ meta_path = {
348
+ "old": f"output_blocks.{i}.1",
349
+ "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
350
+ }
351
+ assign_to_checkpoint(paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config)
352
+ else:
353
+ resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
354
+ for path in resnet_0_paths:
355
+ old_path = ".".join(["output_blocks", str(i), path["old"]])
356
+ new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
357
+
358
+ new_checkpoint[new_path] = unet_state_dict[old_path]
359
+
360
+ # in SD v2 the 1x1 conv2d layers were changed to linear, so convert linear -> conv
361
+ if v2:
362
+ linear_transformer_to_conv(new_checkpoint)
363
+
364
+ return new_checkpoint
365
+
366
+
367
+ def convert_ldm_vae_checkpoint(checkpoint, config):
368
+ # extract state dict for VAE
369
+ vae_state_dict = {}
370
+ vae_key = "first_stage_model."
371
+ keys = list(checkpoint.keys())
372
+ for key in keys:
373
+ if key.startswith(vae_key):
374
+ vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key)
375
+ # if len(vae_state_dict) == 0:
376
+ # # the given checkpoint is a VAE state_dict, not a checkpoint loaded from a .ckpt
377
+ # vae_state_dict = checkpoint
378
+
379
+ new_checkpoint = {}
380
+
381
+ new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"]
382
+ new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"]
383
+ new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"]
384
+ new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"]
385
+ new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"]
386
+ new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"]
387
+
388
+ new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"]
389
+ new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"]
390
+ new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"]
391
+ new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"]
392
+ new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"]
393
+ new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"]
394
+
395
+ new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"]
396
+ new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"]
397
+ new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"]
398
+ new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"]
399
+
400
+ # Retrieves the keys for the encoder down blocks only
401
+ num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer})
402
+ down_blocks = {layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks)}
403
+
404
+ # Retrieves the keys for the decoder up blocks only
405
+ num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer})
406
+ up_blocks = {layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks)}
407
+
408
+ for i in range(num_down_blocks):
409
+ resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]
410
+
411
+ if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
412
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
413
+ f"encoder.down.{i}.downsample.conv.weight"
414
+ )
415
+ new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop(
416
+ f"encoder.down.{i}.downsample.conv.bias"
417
+ )
418
+
419
+ paths = renew_vae_resnet_paths(resnets)
420
+ meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
421
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
422
+
423
+ mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
424
+ num_mid_res_blocks = 2
425
+ for i in range(1, num_mid_res_blocks + 1):
426
+ resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key]
427
+
428
+ paths = renew_vae_resnet_paths(resnets)
429
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
430
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
431
+
432
+ mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key]
433
+ paths = renew_vae_attention_paths(mid_attentions)
434
+ meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
435
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
436
+ conv_attn_to_linear(new_checkpoint)
437
+
438
+ for i in range(num_up_blocks):
439
+ block_id = num_up_blocks - 1 - i
440
+ resnets = [key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key]
441
+
442
+ if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
443
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
444
+ f"decoder.up.{block_id}.upsample.conv.weight"
445
+ ]
446
+ new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[
447
+ f"decoder.up.{block_id}.upsample.conv.bias"
448
+ ]
449
+
450
+ paths = renew_vae_resnet_paths(resnets)
451
+ meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
452
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
453
+
454
+ mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
455
+ num_mid_res_blocks = 2
456
+ for i in range(1, num_mid_res_blocks + 1):
457
+ resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key]
458
+
459
+ paths = renew_vae_resnet_paths(resnets)
460
+ meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}
461
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
462
+
463
+ mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key]
464
+ paths = renew_vae_attention_paths(mid_attentions)
465
+ meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"}
466
+ assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
467
+ conv_attn_to_linear(new_checkpoint)
468
+ return new_checkpoint
469
+
470
+
471
+ def create_unet_diffusers_config(v2):
472
+ """
473
+ Creates a config for the diffusers based on the config of the LDM model.
474
+ """
475
+ # unet_params = original_config.model.params.unet_config.params
476
+
477
+ block_out_channels = [UNET_PARAMS_MODEL_CHANNELS * mult for mult in UNET_PARAMS_CHANNEL_MULT]
478
+
479
+ down_block_types = []
480
+ resolution = 1
481
+ for i in range(len(block_out_channels)):
482
+ block_type = "CrossAttnDownBlock2D" if resolution in UNET_PARAMS_ATTENTION_RESOLUTIONS else "DownBlock2D"
483
+ down_block_types.append(block_type)
484
+ if i != len(block_out_channels) - 1:
485
+ resolution *= 2
486
+
487
+ up_block_types = []
488
+ for i in range(len(block_out_channels)):
489
+ block_type = "CrossAttnUpBlock2D" if resolution in UNET_PARAMS_ATTENTION_RESOLUTIONS else "UpBlock2D"
490
+ up_block_types.append(block_type)
491
+ resolution //= 2
492
+
493
+ config = dict(
494
+ sample_size=UNET_PARAMS_IMAGE_SIZE,
495
+ in_channels=UNET_PARAMS_IN_CHANNELS,
496
+ out_channels=UNET_PARAMS_OUT_CHANNELS,
497
+ down_block_types=tuple(down_block_types),
498
+ up_block_types=tuple(up_block_types),
499
+ block_out_channels=tuple(block_out_channels),
500
+ layers_per_block=UNET_PARAMS_NUM_RES_BLOCKS,
501
+ cross_attention_dim=UNET_PARAMS_CONTEXT_DIM if not v2 else V2_UNET_PARAMS_CONTEXT_DIM,
502
+ attention_head_dim=UNET_PARAMS_NUM_HEADS if not v2 else V2_UNET_PARAMS_ATTENTION_HEAD_DIM,
503
+ )
504
+
505
+ return config
506
+
507
+
508
+ def create_vae_diffusers_config():
509
+ """
510
+ Creates a config for the diffusers based on the config of the LDM model.
511
+ """
512
+ # vae_params = original_config.model.params.first_stage_config.params.ddconfig
513
+ # _ = original_config.model.params.first_stage_config.params.embed_dim
514
+ block_out_channels = [VAE_PARAMS_CH * mult for mult in VAE_PARAMS_CH_MULT]
515
+ down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
516
+ up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)
517
+
518
+ config = dict(
519
+ sample_size=VAE_PARAMS_RESOLUTION,
520
+ in_channels=VAE_PARAMS_IN_CHANNELS,
521
+ out_channels=VAE_PARAMS_OUT_CH,
522
+ down_block_types=tuple(down_block_types),
523
+ up_block_types=tuple(up_block_types),
524
+ block_out_channels=tuple(block_out_channels),
525
+ latent_channels=VAE_PARAMS_Z_CHANNELS,
526
+ layers_per_block=VAE_PARAMS_NUM_RES_BLOCKS,
527
+ )
528
+ return config
529
+
530
+
531
+ def convert_ldm_clip_checkpoint_v1(checkpoint):
532
+ keys = list(checkpoint.keys())
533
+ text_model_dict = {}
534
+ for key in keys:
535
+ if key.startswith("cond_stage_model.transformer"):
536
+ text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key]
537
+ return text_model_dict
538
+
539
+
540
+ def convert_ldm_clip_checkpoint_v2(checkpoint, max_length):
541
+ # the key layout is annoyingly different!
542
+ def convert_key(key):
543
+ if not key.startswith("cond_stage_model"):
544
+ return None
545
+
546
+ # common conversion
547
+ key = key.replace("cond_stage_model.model.transformer.", "text_model.encoder.")
548
+ key = key.replace("cond_stage_model.model.", "text_model.")
549
+
550
+ if "resblocks" in key:
551
+ # resblocks conversion
552
+ key = key.replace(".resblocks.", ".layers.")
553
+ if ".ln_" in key:
554
+ key = key.replace(".ln_", ".layer_norm")
555
+ elif ".mlp." in key:
556
+ key = key.replace(".c_fc.", ".fc1.")
557
+ key = key.replace(".c_proj.", ".fc2.")
558
+ elif ".attn.out_proj" in key:
559
+ key = key.replace(".attn.out_proj.", ".self_attn.out_proj.")
560
+ elif ".attn.in_proj" in key:
561
+ key = None # special case, handled separately below
562
+ else:
563
+ raise ValueError(f"unexpected key in SD: {key}")
564
+ elif ".positional_embedding" in key:
565
+ key = key.replace(".positional_embedding", ".embeddings.position_embedding.weight")
566
+ elif ".text_projection" in key:
567
+ key = None # not used???
568
+ elif ".logit_scale" in key:
569
+ key = None # not used???
570
+ elif ".token_embedding" in key:
571
+ key = key.replace(".token_embedding.weight", ".embeddings.token_embedding.weight")
572
+ elif ".ln_final" in key:
573
+ key = key.replace(".ln_final", ".final_layer_norm")
574
+ return key
575
+
576
+ keys = list(checkpoint.keys())
577
+ new_sd = {}
578
+ for key in keys:
579
+ # remove resblocks 23
580
+ if ".resblocks.23." in key:
581
+ continue
582
+ new_key = convert_key(key)
583
+ if new_key is None:
584
+ continue
585
+ new_sd[new_key] = checkpoint[key]
586
+
587
+ # convert attention weights
588
+ for key in keys:
589
+ if ".resblocks.23." in key:
590
+ continue
591
+ if ".resblocks" in key and ".attn.in_proj_" in key:
592
+ # split into three (q, k, v)
593
+ values = torch.chunk(checkpoint[key], 3)
594
+
595
+ key_suffix = ".weight" if "weight" in key else ".bias"
596
+ key_pfx = key.replace("cond_stage_model.model.transformer.resblocks.", "text_model.encoder.layers.")
597
+ key_pfx = key_pfx.replace("_weight", "")
598
+ key_pfx = key_pfx.replace("_bias", "")
599
+ key_pfx = key_pfx.replace(".attn.in_proj", ".self_attn.")
600
+ new_sd[key_pfx + "q_proj" + key_suffix] = values[0]
601
+ new_sd[key_pfx + "k_proj" + key_suffix] = values[1]
602
+ new_sd[key_pfx + "v_proj" + key_suffix] = values[2]
603
+
604
+ # rename or add position_ids
605
+ ANOTHER_POSITION_IDS_KEY = "text_model.encoder.text_model.embeddings.position_ids"
606
+ if ANOTHER_POSITION_IDS_KEY in new_sd:
607
+ # waifu diffusion v1.4
608
+ position_ids = new_sd[ANOTHER_POSITION_IDS_KEY]
609
+ del new_sd[ANOTHER_POSITION_IDS_KEY]
610
+ else:
611
+ position_ids = torch.Tensor([list(range(max_length))]).to(torch.int64)
612
+
613
+ new_sd["text_model.embeddings.position_ids"] = position_ids
614
+ return new_sd
615
+
616
+
617
+ # endregion
618
+
619
+
620
+ # region Diffusers -> StableDiffusion conversion code
621
+ # copied from convert_diffusers_to_original_stable_diffusion and modified (ASL 2.0)
622
+
623
+
624
+ def conv_transformer_to_linear(checkpoint):
625
+ keys = list(checkpoint.keys())
626
+ tf_keys = ["proj_in.weight", "proj_out.weight"]
627
+ for key in keys:
628
+ if ".".join(key.split(".")[-2:]) in tf_keys:
629
+ if checkpoint[key].ndim > 2:
630
+ checkpoint[key] = checkpoint[key][:, :, 0, 0]
631
+
632
+
633
+ def convert_unet_state_dict_to_sd(v2, unet_state_dict):
634
+ unet_conversion_map = [
635
+ # (stable-diffusion, HF Diffusers)
636
+ ("time_embed.0.weight", "time_embedding.linear_1.weight"),
637
+ ("time_embed.0.bias", "time_embedding.linear_1.bias"),
638
+ ("time_embed.2.weight", "time_embedding.linear_2.weight"),
639
+ ("time_embed.2.bias", "time_embedding.linear_2.bias"),
640
+ ("input_blocks.0.0.weight", "conv_in.weight"),
641
+ ("input_blocks.0.0.bias", "conv_in.bias"),
642
+ ("out.0.weight", "conv_norm_out.weight"),
643
+ ("out.0.bias", "conv_norm_out.bias"),
644
+ ("out.2.weight", "conv_out.weight"),
645
+ ("out.2.bias", "conv_out.bias"),
646
+ ]
647
+
648
+ unet_conversion_map_resnet = [
649
+ # (stable-diffusion, HF Diffusers)
650
+ ("in_layers.0", "norm1"),
651
+ ("in_layers.2", "conv1"),
652
+ ("out_layers.0", "norm2"),
653
+ ("out_layers.3", "conv2"),
654
+ ("emb_layers.1", "time_emb_proj"),
655
+ ("skip_connection", "conv_shortcut"),
656
+ ]
657
+
658
+ unet_conversion_map_layer = []
659
+ for i in range(4):
660
+ # loop over downblocks/upblocks
661
+
662
+ for j in range(2):
663
+ # loop over resnets/attentions for downblocks
664
+ hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}."
665
+ sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0."
666
+ unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix))
667
+
668
+ if i < 3:
669
+ # no attention layers in down_blocks.3
670
+ hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}."
671
+ sd_down_atn_prefix = f"input_blocks.{3*i + j + 1}.1."
672
+ unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix))
673
+
674
+ for j in range(3):
675
+ # loop over resnets/attentions for upblocks
676
+ hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}."
677
+ sd_up_res_prefix = f"output_blocks.{3*i + j}.0."
678
+ unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix))
679
+
680
+ if i > 0:
681
+ # no attention layers in up_blocks.0
682
+ hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}."
683
+ sd_up_atn_prefix = f"output_blocks.{3*i + j}.1."
684
+ unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix))
685
+
686
+ if i < 3:
687
+ # no downsample in down_blocks.3
688
+ hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv."
689
+ sd_downsample_prefix = f"input_blocks.{3*(i+1)}.0.op."
690
+ unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix))
691
+
692
+ # no upsample in up_blocks.3
693
+ hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
694
+ sd_upsample_prefix = f"output_blocks.{3*i + 2}.{1 if i == 0 else 2}."
695
+ unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix))
696
+
697
+ hf_mid_atn_prefix = "mid_block.attentions.0."
698
+ sd_mid_atn_prefix = "middle_block.1."
699
+ unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix))
700
+
701
+ for j in range(2):
702
+ hf_mid_res_prefix = f"mid_block.resnets.{j}."
703
+ sd_mid_res_prefix = f"middle_block.{2*j}."
704
+ unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix))
705
+
706
+ # buyer beware: this is a *brittle* function,
707
+ # and correct output requires that all of these pieces interact in
708
+ # the exact order in which I have arranged them.
709
+ mapping = {k: k for k in unet_state_dict.keys()}
710
+ for sd_name, hf_name in unet_conversion_map:
711
+ mapping[hf_name] = sd_name
712
+ for k, v in mapping.items():
713
+ if "resnets" in k:
714
+ for sd_part, hf_part in unet_conversion_map_resnet:
715
+ v = v.replace(hf_part, sd_part)
716
+ mapping[k] = v
717
+ for k, v in mapping.items():
718
+ for sd_part, hf_part in unet_conversion_map_layer:
719
+ v = v.replace(hf_part, sd_part)
720
+ mapping[k] = v
721
+ new_state_dict = {v: unet_state_dict[k] for k, v in mapping.items()}
722
+
723
+ if v2:
724
+ conv_transformer_to_linear(new_state_dict)
725
+
726
+ return new_state_dict
727
+
728
+
729
+ # ================#
730
+ # VAE Conversion #
731
+ # ================#
732
+
733
+
734
+ def reshape_weight_for_sd(w):
735
+ # convert HF linear weights to SD conv2d weights
736
+ return w.reshape(*w.shape, 1, 1)
737
+
738
+
739
+ def convert_vae_state_dict(vae_state_dict):
740
+ vae_conversion_map = [
741
+ # (stable-diffusion, HF Diffusers)
742
+ ("nin_shortcut", "conv_shortcut"),
743
+ ("norm_out", "conv_norm_out"),
744
+ ("mid.attn_1.", "mid_block.attentions.0."),
745
+ ]
746
+
747
+ for i in range(4):
748
+ # down_blocks have two resnets
749
+ for j in range(2):
750
+ hf_down_prefix = f"encoder.down_blocks.{i}.resnets.{j}."
751
+ sd_down_prefix = f"encoder.down.{i}.block.{j}."
752
+ vae_conversion_map.append((sd_down_prefix, hf_down_prefix))
753
+
754
+ if i < 3:
755
+ hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0."
756
+ sd_downsample_prefix = f"down.{i}.downsample."
757
+ vae_conversion_map.append((sd_downsample_prefix, hf_downsample_prefix))
758
+
759
+ hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
760
+ sd_upsample_prefix = f"up.{3-i}.upsample."
761
+ vae_conversion_map.append((sd_upsample_prefix, hf_upsample_prefix))
762
+
763
+ # up_blocks have three resnets
764
+ # also, up blocks in hf are numbered in reverse from sd
765
+ for j in range(3):
766
+ hf_up_prefix = f"decoder.up_blocks.{i}.resnets.{j}."
767
+ sd_up_prefix = f"decoder.up.{3-i}.block.{j}."
768
+ vae_conversion_map.append((sd_up_prefix, hf_up_prefix))
769
+
770
+ # this part accounts for mid blocks in both the encoder and the decoder
771
+ for i in range(2):
772
+ hf_mid_res_prefix = f"mid_block.resnets.{i}."
773
+ sd_mid_res_prefix = f"mid.block_{i+1}."
774
+ vae_conversion_map.append((sd_mid_res_prefix, hf_mid_res_prefix))
775
+
776
+ vae_conversion_map_attn = [
777
+ # (stable-diffusion, HF Diffusers)
778
+ ("norm.", "group_norm."),
779
+ ("q.", "query."),
780
+ ("k.", "key."),
781
+ ("v.", "value."),
782
+ ("proj_out.", "proj_attn."),
783
+ ]
784
+
785
+ mapping = {k: k for k in vae_state_dict.keys()}
786
+ for k, v in mapping.items():
787
+ for sd_part, hf_part in vae_conversion_map:
788
+ v = v.replace(hf_part, sd_part)
789
+ mapping[k] = v
790
+ for k, v in mapping.items():
791
+ if "attentions" in k:
792
+ for sd_part, hf_part in vae_conversion_map_attn:
793
+ v = v.replace(hf_part, sd_part)
794
+ mapping[k] = v
795
+ new_state_dict = {v: vae_state_dict[k] for k, v in mapping.items()}
796
+ weights_to_convert = ["q", "k", "v", "proj_out"]
797
+ for k, v in new_state_dict.items():
798
+ for weight_name in weights_to_convert:
799
+ if f"mid.attn_1.{weight_name}.weight" in k:
800
+ # print(f"Reshaping {k} for SD format")
801
+ new_state_dict[k] = reshape_weight_for_sd(v)
802
+
803
+ return new_state_dict
804
+
805
+
806
+ # endregion
807
+
808
+ # region custom model loading/saving helpers
809
+
810
+
811
+ def is_safetensors(path):
812
+ return os.path.splitext(path)[1].lower() == ".safetensors"
813
+
814
+
815
+ def load_checkpoint_with_text_encoder_conversion(ckpt_path, device="cpu"):
816
+ # support models that store the text encoder keys differently (missing 'text_model')
817
+ TEXT_ENCODER_KEY_REPLACEMENTS = [
818
+ ("cond_stage_model.transformer.embeddings.", "cond_stage_model.transformer.text_model.embeddings."),
819
+ ("cond_stage_model.transformer.encoder.", "cond_stage_model.transformer.text_model.encoder."),
820
+ ("cond_stage_model.transformer.final_layer_norm.", "cond_stage_model.transformer.text_model.final_layer_norm."),
821
+ ]
822
+
823
+ if is_safetensors(ckpt_path):
824
+ checkpoint = None
825
+ state_dict = load_file(ckpt_path) # , device) # may cause an error
826
+ else:
827
+ checkpoint = torch.load(ckpt_path, map_location=device)
828
+ if "state_dict" in checkpoint:
829
+ state_dict = checkpoint["state_dict"]
830
+ else:
831
+ state_dict = checkpoint
832
+ checkpoint = None
833
+
834
+ key_reps = []
835
+ for rep_from, rep_to in TEXT_ENCODER_KEY_REPLACEMENTS:
836
+ for key in state_dict.keys():
837
+ if key.startswith(rep_from):
838
+ new_key = rep_to + key[len(rep_from) :]
839
+ key_reps.append((key, new_key))
840
+
841
+ for key, new_key in key_reps:
842
+ state_dict[new_key] = state_dict[key]
843
+ del state_dict[key]
844
+
845
+ return checkpoint, state_dict
846
+
847
+
848
+ # TODO the dtype handling looks unreliable and should be checked; whether text_encoder can be created in the requested dtype is unverified
849
+ def load_models_from_stable_diffusion_checkpoint(v2, ckpt_path, device="cpu", dtype=None):
850
+ _, state_dict = load_checkpoint_with_text_encoder_conversion(ckpt_path, device)
851
+
852
+ # Convert the UNet2DConditionModel model.
853
+ unet_config = create_unet_diffusers_config(v2)
854
+ converted_unet_checkpoint = convert_ldm_unet_checkpoint(v2, state_dict, unet_config)
855
+
856
+ unet = UNet2DConditionModel(**unet_config).to(device)
857
+ info = unet.load_state_dict(converted_unet_checkpoint)
858
+ print("loading u-net:", info)
859
+
860
+ # Convert the VAE model.
861
+ vae_config = create_vae_diffusers_config()
862
+ converted_vae_checkpoint = convert_ldm_vae_checkpoint(state_dict, vae_config)
863
+
864
+ vae = AutoencoderKL(**vae_config).to(device)
865
+ info = vae.load_state_dict(converted_vae_checkpoint)
866
+ print("loading vae:", info)
867
+
868
+ # convert text_model
869
+ if v2:
870
+ converted_text_encoder_checkpoint = convert_ldm_clip_checkpoint_v2(state_dict, 77)
871
+ cfg = CLIPTextConfig(
872
+ vocab_size=49408,
873
+ hidden_size=1024,
874
+ intermediate_size=4096,
875
+ num_hidden_layers=23,
876
+ num_attention_heads=16,
877
+ max_position_embeddings=77,
878
+ hidden_act="gelu",
879
+ layer_norm_eps=1e-05,
880
+ dropout=0.0,
881
+ attention_dropout=0.0,
882
+ initializer_range=0.02,
883
+ initializer_factor=1.0,
884
+ pad_token_id=1,
885
+ bos_token_id=0,
886
+ eos_token_id=2,
887
+ model_type="clip_text_model",
888
+ projection_dim=512,
889
+ torch_dtype="float32",
890
+ transformers_version="4.25.0.dev0",
891
+ )
892
+ text_model = CLIPTextModel._from_config(cfg)
893
+ info = text_model.load_state_dict(converted_text_encoder_checkpoint)
894
+ else:
895
+ converted_text_encoder_checkpoint = convert_ldm_clip_checkpoint_v1(state_dict)
896
+
897
+ logging.set_verbosity_error() # don't show annoying warning
898
+ text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
899
+ logging.set_verbosity_warning()
900
+
901
+ info = text_model.load_state_dict(converted_text_encoder_checkpoint)
902
+ print("loading text encoder:", info)
903
+
904
+ return text_model, vae, unet
905
+
906
+
907
+ def convert_text_encoder_state_dict_to_sd_v2(checkpoint, make_dummy_weights=False):
908
+ def convert_key(key):
909
+ # drop position_ids
910
+ if ".position_ids" in key:
911
+ return None
912
+
913
+ # common
914
+ key = key.replace("text_model.encoder.", "transformer.")
915
+ key = key.replace("text_model.", "")
916
+ if "layers" in key:
917
+ # resblocks conversion
918
+ key = key.replace(".layers.", ".resblocks.")
919
+ if ".layer_norm" in key:
920
+ key = key.replace(".layer_norm", ".ln_")
921
+ elif ".mlp." in key:
922
+ key = key.replace(".fc1.", ".c_fc.")
923
+ key = key.replace(".fc2.", ".c_proj.")
924
+ elif ".self_attn.out_proj" in key:
925
+ key = key.replace(".self_attn.out_proj.", ".attn.out_proj.")
926
+ elif ".self_attn." in key:
927
+ key = None # special case, handled separately below
928
+ else:
929
+ raise ValueError(f"unexpected key in DiffUsers model: {key}")
930
+ elif ".position_embedding" in key:
931
+ key = key.replace("embeddings.position_embedding.weight", "positional_embedding")
932
+ elif ".token_embedding" in key:
933
+ key = key.replace("embeddings.token_embedding.weight", "token_embedding.weight")
934
+ elif "final_layer_norm" in key:
935
+ key = key.replace("final_layer_norm", "ln_final")
936
+ return key
937
+
938
+ keys = list(checkpoint.keys())
939
+ new_sd = {}
940
+ for key in keys:
941
+ new_key = convert_key(key)
942
+ if new_key is None:
943
+ continue
944
+ new_sd[new_key] = checkpoint[key]
945
+
946
+ # convert attention weights
947
+ for key in keys:
948
+ if "layers" in key and "q_proj" in key:
949
+ # concatenate the three projections (q, k, v)
950
+ key_q = key
951
+ key_k = key.replace("q_proj", "k_proj")
952
+ key_v = key.replace("q_proj", "v_proj")
953
+
954
+ value_q = checkpoint[key_q]
955
+ value_k = checkpoint[key_k]
956
+ value_v = checkpoint[key_v]
957
+ value = torch.cat([value_q, value_k, value_v])
958
+
959
+ new_key = key.replace("text_model.encoder.layers.", "transformer.resblocks.")
960
+ new_key = new_key.replace(".self_attn.q_proj.", ".attn.in_proj_")
961
+ new_sd[new_key] = value
962
+
963
+ # whether to fabricate dummy weights for the final layer etc.
964
+ if make_dummy_weights:
965
+ print("make dummy weights for resblock.23, text_projection and logit scale.")
966
+ keys = list(new_sd.keys())
967
+ for key in keys:
968
+ if key.startswith("transformer.resblocks.22."):
969
+ new_sd[key.replace(".22.", ".23.")] = new_sd[key].clone() # must clone, otherwise saving with safetensors fails
970
+
971
+ # create weights that are not present in the Diffusers model
972
+ new_sd["text_projection"] = torch.ones((1024, 1024), dtype=new_sd[keys[0]].dtype, device=new_sd[keys[0]].device)
973
+ new_sd["logit_scale"] = torch.tensor(1)
974
+
975
+ return new_sd
976
+
977
+
978
+ def save_stable_diffusion_checkpoint(v2, output_file, text_encoder, unet, ckpt_path, epochs, steps, save_dtype=None, vae=None):
979
+ if ckpt_path is not None:
980
+ # read epoch/step from it; also reload the checkpoint including the VAE, e.g. when the VAE is not in memory
981
+ checkpoint, state_dict = load_checkpoint_with_text_encoder_conversion(ckpt_path)
982
+ if checkpoint is None: # safetensors, or a ckpt that is only a state_dict
983
+ checkpoint = {}
984
+ strict = False
985
+ else:
986
+ strict = True
987
+ if "state_dict" in state_dict:
988
+ del state_dict["state_dict"]
989
+ else:
990
+ # build a new checkpoint from scratch
991
+ assert vae is not None, "VAE is required to save a checkpoint without a given checkpoint"
992
+ checkpoint = {}
993
+ state_dict = {}
994
+ strict = False
995
+
996
+ def update_sd(prefix, sd):
997
+ for k, v in sd.items():
998
+ key = prefix + k
999
+ assert not strict or key in state_dict, f"Illegal key in save SD: {key}"
1000
+ if save_dtype is not None:
1001
+ v = v.detach().clone().to("cpu").to(save_dtype)
1002
+ state_dict[key] = v
1003
+
1004
+ # Convert the UNet model
1005
+ unet_state_dict = convert_unet_state_dict_to_sd(v2, unet.state_dict())
1006
+ update_sd("model.diffusion_model.", unet_state_dict)
1007
+
1008
+ # Convert the text encoder model
1009
+ if v2:
1010
+ make_dummy = ckpt_path is None # without a reference checkpoint, insert dummy weights (e.g. duplicate the last layer from the one before it)
1011
+ text_enc_dict = convert_text_encoder_state_dict_to_sd_v2(text_encoder.state_dict(), make_dummy)
1012
+ update_sd("cond_stage_model.model.", text_enc_dict)
1013
+ else:
1014
+ text_enc_dict = text_encoder.state_dict()
1015
+ update_sd("cond_stage_model.transformer.", text_enc_dict)
1016
+
1017
+ # Convert the VAE
1018
+ if vae is not None:
1019
+ vae_dict = convert_vae_state_dict(vae.state_dict())
1020
+ update_sd("first_stage_model.", vae_dict)
1021
+
1022
+ # Put together new checkpoint
1023
+ key_count = len(state_dict.keys())
1024
+ new_ckpt = {"state_dict": state_dict}
1025
+
1026
+ # epoch and global_step are sometimes not int
1027
+ try:
1028
+ if "epoch" in checkpoint:
1029
+ epochs += checkpoint["epoch"]
1030
+ if "global_step" in checkpoint:
1031
+ steps += checkpoint["global_step"]
1032
+ except:
1033
+ pass
1034
+
1035
+ new_ckpt["epoch"] = epochs
1036
+ new_ckpt["global_step"] = steps
1037
+
1038
+ if is_safetensors(output_file):
1039
+ # TODO should non-Tensor values be removed from the dict?
1040
+ save_file(state_dict, output_file)
1041
+ else:
1042
+ torch.save(new_ckpt, output_file)
1043
+
1044
+ return key_count
1045
+
1046
+
1047
+ def save_diffusers_checkpoint(v2, output_dir, text_encoder, unet, pretrained_model_name_or_path, vae=None, use_safetensors=False):
1048
+ if pretrained_model_name_or_path is None:
1049
+ # load default settings for v1/v2
1050
+ if v2:
1051
+ pretrained_model_name_or_path = DIFFUSERS_REF_MODEL_ID_V2
1052
+ else:
1053
+ pretrained_model_name_or_path = DIFFUSERS_REF_MODEL_ID_V1
1054
+
1055
+ scheduler = DDIMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler")
1056
+ tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder="tokenizer")
1057
+ if vae is None:
1058
+ vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae")
1059
+
1060
+ pipeline = StableDiffusionPipeline(
1061
+ unet=unet,
1062
+ text_encoder=text_encoder,
1063
+ vae=vae,
1064
+ scheduler=scheduler,
1065
+ tokenizer=tokenizer,
1066
+ safety_checker=None,
1067
+ feature_extractor=None,
1068
+ requires_safety_checker=None,
1069
+ )
1070
+ pipeline.save_pretrained(output_dir, safe_serialization=use_safetensors)
1071
+
1072
+
1073
+ VAE_PREFIX = "first_stage_model."
1074
+
1075
+
1076
+ def load_vae(vae_id, dtype):
1077
+ print(f"load VAE: {vae_id}")
1078
+ if os.path.isdir(vae_id) or not os.path.isfile(vae_id):
1079
+ # Diffusers local/remote
1080
+ try:
1081
+ vae = AutoencoderKL.from_pretrained(vae_id, subfolder=None, torch_dtype=dtype)
1082
+ except EnvironmentError as e:
1083
+ print(f"exception occurs in loading vae: {e}")
1084
+ print("retry with subfolder='vae'")
1085
+ vae = AutoencoderKL.from_pretrained(vae_id, subfolder="vae", torch_dtype=dtype)
1086
+ return vae
1087
+
1088
+ # local
1089
+ vae_config = create_vae_diffusers_config()
1090
+
1091
+ if vae_id.endswith(".bin"):
1092
+ # SD 1.5 VAE on Huggingface
1093
+ converted_vae_checkpoint = torch.load(vae_id, map_location="cpu")
1094
+ else:
1095
+ # StableDiffusion
1096
+ vae_model = load_file(vae_id, "cpu") if is_safetensors(vae_id) else torch.load(vae_id, map_location="cpu")
1097
+ vae_sd = vae_model["state_dict"] if "state_dict" in vae_model else vae_model
1098
+
1099
+ # vae only or full model
1100
+ full_model = False
1101
+ for vae_key in vae_sd:
1102
+ if vae_key.startswith(VAE_PREFIX):
1103
+ full_model = True
1104
+ break
1105
+ if not full_model:
1106
+ sd = {}
1107
+ for key, value in vae_sd.items():
1108
+ sd[VAE_PREFIX + key] = value
1109
+ vae_sd = sd
1110
+ del sd
1111
+
1112
+ # Convert the VAE model.
1113
+ converted_vae_checkpoint = convert_ldm_vae_checkpoint(vae_sd, vae_config)
1114
+
1115
+ vae = AutoencoderKL(**vae_config)
1116
+ vae.load_state_dict(converted_vae_checkpoint)
1117
+ return vae
1118
+
1119
+
1120
+ # endregion
1121
+
1122
+
1123
+ def make_bucket_resolutions(max_reso, min_size=256, max_size=1024, divisible=64):
1124
+ max_width, max_height = max_reso
1125
+ max_area = (max_width // divisible) * (max_height // divisible)
1126
+
1127
+ resos = set()
1128
+
1129
+ size = int(math.sqrt(max_area)) * divisible
1130
+ resos.add((size, size))
1131
+
1132
+ size = min_size
1133
+ while size <= max_size:
1134
+ width = size
1135
+ height = min(max_size, (max_area // (width // divisible)) * divisible)
1136
+ resos.add((width, height))
1137
+ resos.add((height, width))
1138
+
1139
+ # # make additional resos
1140
+ # if width >= height and width - divisible >= min_size:
1141
+ # resos.add((width - divisible, height))
1142
+ # resos.add((height, width - divisible))
1143
+ # if height >= width and height - divisible >= min_size:
1144
+ # resos.add((width, height - divisible))
1145
+ # resos.add((height - divisible, width))
1146
+
1147
+ size += divisible
1148
+
1149
+ resos = list(resos)
1150
+ resos.sort()
1151
+ return resos
1152
+
1153
+
1154
+ if __name__ == "__main__":
1155
+ resos = make_bucket_resolutions((512, 768))
1156
+ print(len(resos))
1157
+ print(resos)
1158
+ aspect_ratios = [w / h for w, h in resos]
1159
+ print(aspect_ratios)
1160
+
1161
+ ars = set()
1162
+ for ar in aspect_ratios:
1163
+ if ar in ars:
1164
+ print("error! duplicate ar:", ar)
1165
+ ars.add(ar)
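
A minimal usage sketch of the conversion helpers above, assuming hypothetical local checkpoint paths: it loads an SD 1.x checkpoint into Diffusers-style modules and writes it back as a single SD-format file in fp16. Only the paths and dtype are assumptions; the function names and signatures come from this file.

    import torch
    from library import model_util

    ckpt_in = "models/example-v1.safetensors"        # hypothetical input path
    ckpt_out = "models/example-v1-fp16.safetensors"  # hypothetical output path

    # v2=False selects the SD 1.x layout (768-dim context, 8 attention heads)
    text_encoder, vae, unet = model_util.load_models_from_stable_diffusion_checkpoint(False, ckpt_in)

    # write everything back into one SD-format checkpoint, casting weights to fp16
    model_util.save_stable_diffusion_checkpoint(
        False, ckpt_out, text_encoder, unet, ckpt_in,
        epochs=0, steps=0, save_dtype=torch.float16, vae=vae,
    )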
library/resize_lora_gui.py ADDED
@@ -0,0 +1,173 @@
1
+ import gradio as gr
2
+ from easygui import msgbox
3
+ import subprocess
4
+ import os
5
+ from .common_gui import get_saveasfilename_path, get_file_path
6
+
7
+ PYTHON = 'python3' if os.name == 'posix' else './venv/Scripts/python.exe'
8
+ folder_symbol = '\U0001f4c2' # 📂
9
+ refresh_symbol = '\U0001f504' # 🔄
10
+ save_style_symbol = '\U0001f4be' # 💾
11
+ document_symbol = '\U0001F4C4' # 📄
12
+
13
+
14
+ def resize_lora(
15
+ model,
16
+ new_rank,
17
+ save_to,
18
+ save_precision,
19
+ device,
20
+ dynamic_method,
21
+ dynamic_param,
22
+ verbose,
23
+ ):
24
+ # Check that a model file was provided
25
+ if model == '':
26
+ msgbox('Invalid model file')
27
+ return
28
+
29
+ # Check that the source model exists
30
+ if not os.path.isfile(model):
31
+ msgbox('The provided model is not a file')
32
+ return
33
+
34
+ if dynamic_method == 'sv_ratio':
35
+ if float(dynamic_param) < 2:
36
+ msgbox(
37
+ f'Dynamic parameter for {dynamic_method} needs to be 2 or greater...'
38
+ )
39
+ return
40
+
41
+ if dynamic_method == 'sv_fro' or dynamic_method == 'sv_cumulative':
42
+ if float(dynamic_param) < 0 or float(dynamic_param) > 1:
43
+ msgbox(
44
+ f'Dynamic parameter for {dynamic_method} needs to be between 0 and 1...'
45
+ )
46
+ return
47
+
48
+ # Check if save_to ends with one of the expected extensions. If not, add .safetensors.
49
+ if not save_to.endswith(('.pt', '.safetensors')):
50
+ save_to += '.safetensors'
51
+
52
+ if device == '':
53
+ device = 'cuda'
54
+
55
+ run_cmd = f'{PYTHON} "{os.path.join("networks","resize_lora.py")}"'
56
+ run_cmd += f' --save_precision {save_precision}'
57
+ run_cmd += f' --save_to "{save_to}"'
58
+ run_cmd += f' --model "{model}"'
59
+ run_cmd += f' --new_rank {new_rank}'
60
+ run_cmd += f' --device {device}'
61
+ if not dynamic_method == 'None':
62
+ run_cmd += f' --dynamic_method {dynamic_method}'
63
+ run_cmd += f' --dynamic_param {dynamic_param}'
64
+ if verbose:
65
+ run_cmd += f' --verbose'
66
+
67
+ print(run_cmd)
68
+
69
+ # Run the command
70
+ if os.name == 'posix':
71
+ os.system(run_cmd)
72
+ else:
73
+ subprocess.run(run_cmd)
74
+
75
+
76
+ ###
77
+ # Gradio UI
78
+ ###
79
+
80
+
81
+ def gradio_resize_lora_tab():
82
+ with gr.Tab('Resize LoRA'):
83
+ gr.Markdown('This utility can resize a LoRA.')
84
+
85
+ lora_ext = gr.Textbox(value='*.safetensors *.pt', visible=False)
86
+ lora_ext_name = gr.Textbox(value='LoRA model types', visible=False)
87
+
88
+ with gr.Row():
89
+ model = gr.Textbox(
90
+ label='Source LoRA',
91
+ placeholder='Path to the LoRA to resize',
92
+ interactive=True,
93
+ )
94
+ button_lora_a_model_file = gr.Button(
95
+ folder_symbol, elem_id='open_folder_small'
96
+ )
97
+ button_lora_a_model_file.click(
98
+ get_file_path,
99
+ inputs=[model, lora_ext, lora_ext_name],
100
+ outputs=model,
101
+ show_progress=False,
102
+ )
103
+ with gr.Row():
104
+ new_rank = gr.Slider(
105
+ label='Desired LoRA rank',
106
+ minimum=1,
107
+ maximum=1024,
108
+ step=1,
109
+ value=4,
110
+ interactive=True,
111
+ )
112
+
113
+ with gr.Row():
114
+ dynamic_method = gr.Dropdown(
115
+ choices=['None', 'sv_ratio', 'sv_fro', 'sv_cumulative'],
116
+ value='sv_fro',
117
+ label='Dynamic method',
118
+ interactive=True,
119
+ )
120
+ dynamic_param = gr.Textbox(
121
+ label='Dynamic parameter',
122
+ value='0.9',
123
+ interactive=True,
124
+ placeholder='Value for the dynamic method selected.',
125
+ )
126
+ verbose = gr.Checkbox(label='Verbose', value=False)
127
+ with gr.Row():
128
+ save_to = gr.Textbox(
129
+ label='Save to',
130
+ placeholder='path for the LoRA file to save...',
131
+ interactive=True,
132
+ )
133
+ button_save_to = gr.Button(
134
+ folder_symbol, elem_id='open_folder_small'
135
+ )
136
+ button_save_to.click(
137
+ get_saveasfilename_path,
138
+ inputs=[save_to, lora_ext, lora_ext_name],
139
+ outputs=save_to,
140
+ show_progress=False,
141
+ )
142
+ save_precision = gr.Dropdown(
143
+ label='Save precision',
144
+ choices=['fp16', 'bf16', 'float'],
145
+ value='fp16',
146
+ interactive=True,
147
+ )
148
+ device = gr.Dropdown(
149
+ label='Device',
150
+ choices=[
151
+ 'cpu',
152
+ 'cuda',
153
+ ],
154
+ value='cuda',
155
+ interactive=True,
156
+ )
157
+
158
+ convert_button = gr.Button('Resize model')
159
+
160
+ convert_button.click(
161
+ resize_lora,
162
+ inputs=[
163
+ model,
164
+ new_rank,
165
+ save_to,
166
+ save_precision,
167
+ device,
168
+ dynamic_method,
169
+ dynamic_param,
170
+ verbose,
171
+ ],
172
+ show_progress=False,
173
+ )
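
The function above only composes and runs a command line for networks/resize_lora.py; a hedged sketch of calling it directly with illustrative values (the paths and numbers are hypothetical, the parameter names come from the signature above):

    from library.resize_lora_gui import resize_lora

    resize_lora(
        model="loras/example.safetensors",   # hypothetical LoRA to shrink
        new_rank=8,
        save_to="loras/example-rank8",       # '.safetensors' is appended automatically
        save_precision="fp16",
        device="cuda",
        dynamic_method="sv_fro",
        dynamic_param="0.9",                 # must be between 0 and 1 for sv_fro
        verbose=True,
    )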
library/sampler_gui.py ADDED
@@ -0,0 +1,102 @@
1
+ import tempfile
2
+ import os
3
+ import gradio as gr
4
+ from easygui import msgbox
5
+
6
+ folder_symbol = '\U0001f4c2' # 📂
7
+ refresh_symbol = '\U0001f504' # 🔄
8
+ save_style_symbol = '\U0001f4be' # 💾
9
+ document_symbol = '\U0001F4C4' # 📄
10
+
11
+
12
+ ###
13
+ ### Gradio common sampler GUI section
14
+ ###
15
+
16
+
17
+ def sample_gradio_config():
18
+ with gr.Accordion('Sample images config', open=False):
19
+ with gr.Row():
20
+ sample_every_n_steps = gr.Number(
21
+ label='Sample every n steps',
22
+ value=0,
23
+ precision=0,
24
+ interactive=True,
25
+ )
26
+ sample_every_n_epochs = gr.Number(
27
+ label='Sample every n epochs',
28
+ value=0,
29
+ precision=0,
30
+ interactive=True,
31
+ )
32
+ sample_sampler = gr.Dropdown(
33
+ label='Sample sampler',
34
+ choices=[
35
+ 'ddim',
36
+ 'pndm',
37
+ 'lms',
38
+ 'euler',
39
+ 'euler_a',
40
+ 'heun',
41
+ 'dpm_2',
42
+ 'dpm_2_a',
43
+ 'dpmsolver',
44
+ 'dpmsolver++',
45
+ 'dpmsingle',
46
+ 'k_lms',
47
+ 'k_euler',
48
+ 'k_euler_a',
49
+ 'k_dpm_2',
50
+ 'k_dpm_2_a',
51
+ ],
52
+ value='euler_a',
53
+ interactive=True,
54
+ )
55
+ with gr.Row():
56
+ sample_prompts = gr.Textbox(
57
+ lines=5,
58
+ label='Sample prompts',
59
+ interactive=True,
60
+ placeholder='masterpiece, best quality, 1girl, in white shirts, upper body, looking at viewer, simple background --n low quality, worst quality, bad anatomy,bad composition, poor, low effort --w 768 --h 768 --d 1 --l 7.5 --s 28',
61
+ )
62
+ return (
63
+ sample_every_n_steps,
64
+ sample_every_n_epochs,
65
+ sample_sampler,
66
+ sample_prompts,
67
+ )
68
+
69
+
70
+ def run_cmd_sample(
71
+ sample_every_n_steps,
72
+ sample_every_n_epochs,
73
+ sample_sampler,
74
+ sample_prompts,
75
+ output_dir,
76
+ ):
77
+ output_dir = os.path.join(output_dir, 'sample')
78
+
79
+ if not os.path.exists(output_dir):
80
+ os.makedirs(output_dir)
81
+
82
+ run_cmd = ''
83
+
84
+ if sample_every_n_epochs == 0 and sample_every_n_steps == 0:
85
+ return run_cmd
86
+
87
+ # Create the prompt file and get its path
88
+ sample_prompts_path = os.path.join(output_dir, 'prompt.txt')
89
+
90
+ with open(sample_prompts_path, 'w') as f:
91
+ f.write(sample_prompts)
92
+
93
+ run_cmd += f' --sample_sampler={sample_sampler}'
94
+ run_cmd += f' --sample_prompts="{sample_prompts_path}"'
95
+
96
+ if not sample_every_n_epochs == 0:
97
+ run_cmd += f' --sample_every_n_epochs="{sample_every_n_epochs}"'
98
+
99
+ if not sample_every_n_steps == 0:
100
+ run_cmd += f' --sample_every_n_steps="{sample_every_n_steps}"'
101
+
102
+ return run_cmd
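
run_cmd_sample only writes the prompt file and returns extra CLI flags for the trainer; a sketch with illustrative values (output_dir and the prompt text are hypothetical):

    from library.sampler_gui import run_cmd_sample

    extra_args = run_cmd_sample(
        sample_every_n_steps=0,
        sample_every_n_epochs=1,
        sample_sampler="euler_a",
        sample_prompts="1girl, looking at viewer --w 512 --h 512 --s 28",
        output_dir="outputs/example-run",    # prompt.txt lands in outputs/example-run/sample/
    )
    # extra_args would look roughly like:
    # ' --sample_sampler=euler_a --sample_prompts="outputs/example-run/sample/prompt.txt" --sample_every_n_epochs="1"'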
library/svd_merge_lora_gui.py ADDED
@@ -0,0 +1,190 @@
1
+ import gradio as gr
2
+ from easygui import msgbox
3
+ import subprocess
4
+ import os
5
+ from .common_gui import (
6
+ get_saveasfilename_path,
7
+ get_any_file_path,
8
+ get_file_path,
9
+ )
10
+
11
+ folder_symbol = '\U0001f4c2' # 📂
12
+ refresh_symbol = '\U0001f504' # 🔄
13
+ save_style_symbol = '\U0001f4be' # 💾
14
+ document_symbol = '\U0001F4C4' # 📄
15
+ PYTHON = 'python3' if os.name == 'posix' else './venv/Scripts/python.exe'
16
+
17
+
18
+ def svd_merge_lora(
19
+ lora_a_model,
20
+ lora_b_model,
21
+ ratio,
22
+ save_to,
23
+ precision,
24
+ save_precision,
25
+ new_rank,
26
+ new_conv_rank,
27
+ device,
28
+ ):
29
+ # Check that both model files were provided
30
+ if lora_a_model == '':
31
+ msgbox('Invalid model A file')
32
+ return
33
+
34
+ if lora_b_model == '':
35
+ msgbox('Invalid model B file')
36
+ return
37
+
38
+ # Check that the source models exist
39
+ if not os.path.isfile(lora_a_model):
40
+ msgbox('The provided model A is not a file')
41
+ return
42
+
43
+ if not os.path.isfile(lora_b_model):
44
+ msgbox('The provided model B is not a file')
45
+ return
46
+
47
+ ratio_a = ratio
48
+ ratio_b = 1 - ratio
49
+
50
+ run_cmd = f'{PYTHON} "{os.path.join("networks","svd_merge_lora.py")}"'
51
+ run_cmd += f' --save_precision {save_precision}'
52
+ run_cmd += f' --precision {precision}'
53
+ run_cmd += f' --save_to "{save_to}"'
54
+ run_cmd += f' --models "{lora_a_model}" "{lora_b_model}"'
55
+ run_cmd += f' --ratios {ratio_a} {ratio_b}'
56
+ run_cmd += f' --device {device}'
57
+ run_cmd += f' --new_rank "{new_rank}"'
58
+ run_cmd += f' --new_conv_rank "{new_conv_rank}"'
59
+
60
+ print(run_cmd)
61
+
62
+ # Run the command
63
+ if os.name == 'posix':
64
+ os.system(run_cmd)
65
+ else:
66
+ subprocess.run(run_cmd)
67
+
68
+
69
+ ###
70
+ # Gradio UI
71
+ ###
72
+
73
+
74
+ def gradio_svd_merge_lora_tab():
75
+ with gr.Tab('Merge LoRA (SVD)'):
76
+ gr.Markdown('This utility can merge two LoRA networks together using SVD.')
77
+
78
+ lora_ext = gr.Textbox(value='*.safetensors *.pt', visible=False)
79
+ lora_ext_name = gr.Textbox(value='LoRA model types', visible=False)
80
+
81
+ with gr.Row():
82
+ lora_a_model = gr.Textbox(
83
+ label='LoRA model "A"',
84
+ placeholder='Path to the LoRA A model',
85
+ interactive=True,
86
+ )
87
+ button_lora_a_model_file = gr.Button(
88
+ folder_symbol, elem_id='open_folder_small'
89
+ )
90
+ button_lora_a_model_file.click(
91
+ get_file_path,
92
+ inputs=[lora_a_model, lora_ext, lora_ext_name],
93
+ outputs=lora_a_model,
94
+ show_progress=False,
95
+ )
96
+
97
+ lora_b_model = gr.Textbox(
98
+ label='LoRA model "B"',
99
+ placeholder='Path to the LoRA B model',
100
+ interactive=True,
101
+ )
102
+ button_lora_b_model_file = gr.Button(
103
+ folder_symbol, elem_id='open_folder_small'
104
+ )
105
+ button_lora_b_model_file.click(
106
+ get_file_path,
107
+ inputs=[lora_b_model, lora_ext, lora_ext_name],
108
+ outputs=lora_b_model,
109
+ show_progress=False,
110
+ )
111
+ with gr.Row():
112
+ ratio = gr.Slider(
113
+ label='Merge ratio (e.g. 0.7 means 70% of model A and 30% of model B)',
114
+ minimum=0,
115
+ maximum=1,
116
+ step=0.01,
117
+ value=0.5,
118
+ interactive=True,
119
+ )
120
+ new_rank = gr.Slider(
121
+ label='New Rank',
122
+ minimum=1,
123
+ maximum=1024,
124
+ step=1,
125
+ value=128,
126
+ interactive=True,
127
+ )
128
+ new_conv_rank = gr.Slider(
129
+ label='New Conv Rank',
130
+ minimum=1,
131
+ maximum=1024,
132
+ step=1,
133
+ value=128,
134
+ interactive=True,
135
+ )
136
+
137
+ with gr.Row():
138
+ save_to = gr.Textbox(
139
+ label='Save to',
140
+ placeholder='path for the file to save...',
141
+ interactive=True,
142
+ )
143
+ button_save_to = gr.Button(
144
+ folder_symbol, elem_id='open_folder_small'
145
+ )
146
+ button_save_to.click(
147
+ get_saveasfilename_path,
148
+ inputs=[save_to, lora_ext, lora_ext_name],
149
+ outputs=save_to,
150
+ show_progress=False,
151
+ )
152
+ precision = gr.Dropdown(
153
+ label='Merge precision',
154
+ choices=['fp16', 'bf16', 'float'],
155
+ value='float',
156
+ interactive=True,
157
+ )
158
+ save_precision = gr.Dropdown(
159
+ label='Save precision',
160
+ choices=['fp16', 'bf16', 'float'],
161
+ value='float',
162
+ interactive=True,
163
+ )
164
+ device = gr.Dropdown(
165
+ label='Device',
166
+ choices=[
167
+ 'cpu',
168
+ 'cuda',
169
+ ],
170
+ value='cuda',
171
+ interactive=True,
172
+ )
173
+
174
+ convert_button = gr.Button('Merge model')
175
+
176
+ convert_button.click(
177
+ svd_merge_lora,
178
+ inputs=[
179
+ lora_a_model,
180
+ lora_b_model,
181
+ ratio,
182
+ save_to,
183
+ precision,
184
+ save_precision,
185
+ new_rank,
186
+ new_conv_rank,
187
+ device,
188
+ ],
189
+ show_progress=False,
190
+ )
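
Like the other utility tabs, svd_merge_lora just shells out to networks/svd_merge_lora.py; a sketch with illustrative values (the model paths and ranks are hypothetical, the parameters match the signature above):

    from library.svd_merge_lora_gui import svd_merge_lora

    svd_merge_lora(
        lora_a_model="loras/style_a.safetensors",
        lora_b_model="loras/style_b.safetensors",
        ratio=0.7,               # 70% of A, 30% of B
        save_to="loras/merged.safetensors",
        precision="float",       # precision used during the merge
        save_precision="fp16",
        new_rank=64,
        new_conv_rank=64,
        device="cuda",
    )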