Aduc_sdr

Paused

App Files Files Community

Aduc-sdr committed on Sep 4

Commit

4e9876a

verified ·

1 Parent(s): b0aeb06

Update managers/prompt_enhancer_manager.py

Browse files

Files changed (1) hide show

managers/prompt_enhancer_manager.py +91 -79

managers/prompt_enhancer_manager.py CHANGED Viewed

@@ -1,80 +1,98 @@
-# managers/prompt_enhancer_manager.py
 #
 # Copyright (C) 2025 Carlos Rodrigues dos Santos
 #
-# Version: 2.0.1 (Definitive Fix)
 #
-# This version re-introduces the essential `attn_implementation="eager"` parameter
-# during the loading of the Florence-2 model. This is required to solve the
-# '_supports_sdpa' AttributeError in our specific environment, while keeping
-# the correct inference pipeline from the functional example.
-import torch
 import logging
-import yaml
 from PIL import Image
-from transformers import AutoProcessor, AutoModelForCausalLM, AutoTokenizer
-from pathlib import Path
 logger = logging.getLogger(__name__)
-class PromptEnhancerManager:
     def __init__(self):
-        logger.info("Initializing Prompt Enhancer Manager...")
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.dtype = torch.bfloat16 if self.device == "cuda" else torch.float32
-        self.system_prompt = ""
         try:
-            with open("config.yaml", 'r') as f:
-                config = yaml.safe_load(f)['specialists']['prompt_enhancer']
-            caption_model_name = config['image_caption_model']
-            llm_model_name = config['llm_model']
-            prompt_filename = config.get('prompt_file')
-            if not prompt_filename:
-                raise ValueError("Config for prompt_enhancer is missing the 'prompt_file' key.")
-            prompt_path = Path(prompt_filename)
-            if not prompt_path.is_file():
-                raise FileNotFoundError(f"Enhancer prompt file not found at: {prompt_path}")
-            self.system_prompt = prompt_path.read_text(encoding="utf-8").strip()
-            logger.info(f"Loaded system prompt for enhancer from: {prompt_path}")
-            logger.info(f"Loading Image Caption Model: {caption_model_name}...")
-            self.caption_processor = AutoProcessor.from_pretrained(caption_model_name, trust_remote_code=True)
-            # <--- CORREÇÃO DEFINITIVA AQUI --->
-            # Adicionando de volta o parâmetro CRÍTICO para compatibilidade
-            self.caption_model = AutoModelForCausalLM.from_pretrained(
-                caption_model_name,
-                torch_dtype=self.dtype,
-                trust_remote_code=True,
-                attn_implementation="eager"  # Essencial para evitar o erro _supports_sdpa
-            ).to(self.device)
-            # <--- FIM DA CORREÇÃO --->
-            logger.info(f"Loading LLM for Prompt Enhancement: {llm_model_name}...")
-            self.llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
-            self.llm_model = AutoModelForCausalLM.from_pretrained(
-                llm_model_name, torch_dtype=self.dtype, device_map="auto"
-            )
-            logger.info("Prompt Enhancer Manager initialized successfully.")
         except Exception as e:
-            logger.critical("Failed to initialize PromptEnhancerManager.", exc_info=True)
-            raise e
-    @torch.no_grad()
-    def get_image_caption(self, image: Image.Image) -> str:
         """
-        Tool 1: Describes a single image using the official Florence-2 inference pipeline.
         """
-        task_prompt = '<MORE_DETAILED_CAPTION>'
         inputs = self.caption_processor(
-            text=task_prompt, images=image, return_tensors="pt"
-        ).to(self.device, self.dtype)
         generated_ids = self.caption_model.generate(
             input_ids=inputs["input_ids"],
@@ -83,42 +101,36 @@ class PromptEnhancerManager:
             num_beams=3,
         )
         generated_text = self.caption_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
         processed_result = self.caption_processor.post_process_generation(
             generated_text,
             task=task_prompt,
-            image_size=(image.width, image.height)
         )
-        caption = processed_result[task_prompt]
-        return caption
-    @torch.no_grad()
-    def get_llm_enhanced_prompt(self, user_content_prompt: str) -> str:
         """
-        Tool 2: Takes a pre-formatted user content prompt, combines it with the
-        system prompt, and gets a cinematic response from the LLM.
         """
-        messages = [
-            {"role": "system", "content": self.system_prompt},
-            {"role": "user", "content": user_content_prompt}
-        ]
-        input_ids = self.llm_tokenizer.apply_chat_template(
-            messages, add_generation_prompt=True, return_tensors="pt"
-        ).to(self.llm_model.device)
-        outputs = self.llm_model.generate(
-            input_ids, max_new_tokens=256, do_sample=True, temperature=0.7, top_p=0.9
         )
-        response = self.llm_tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
-        return response.strip()
 # --- Singleton Instantiation ---
 try:
-    prompt_enhancer_manager_singleton = PromptEnhancerManager()
 except Exception as e:
-    prompt_enhancer_manager_singleton = None
     raise e

+# engineers/deformes3D_thinker.py
 #
 # Copyright (C) 2025 Carlos Rodrigues dos Santos
 #
+# Version: 4.0.0 (Definitive)
 #
+# This is the definitive, robust implementation. It directly contains the prompt
+# enhancement logic copied from the LTX pipeline's utils. It accesses the
+# enhancement models loaded by the LTX Manager and performs the captioning
+# and LLM generation steps locally, ensuring full control and compatibility.
 import logging
 from PIL import Image
+import torch
+# Importa o singleton do LTX para ter acesso à sua pipeline e aos modelos nela
+from managers.ltx_manager import ltx_manager_singleton
+# Importa o prompt de sistema do LTX para garantir consistência
+from ltx_video.utils.prompt_enhance_utils import I2V_CINEMATIC_PROMPT
 logger = logging.getLogger(__name__)
+class Deformes3DThinker:
+    """
+    The tactical specialist that now directly implements the prompt enhancement
+    logic, using the models provided by the LTX pipeline.
+    """
     def __init__(self):
+        # Acessa a pipeline exposta para obter os modelos necessários
+        pipeline = ltx_manager_singleton.prompt_enhancement_pipeline
+        if not pipeline:
+            raise RuntimeError("Deformes3DThinker could not access the LTX pipeline.")
+        # Armazena os modelos e processadores como atributos diretos
+        self.caption_model = pipeline.prompt_enhancer_image_caption_model
+        self.caption_processor = pipeline.prompt_enhancer_image_caption_processor
+        self.llm_model = pipeline.prompt_enhancer_llm_model
+        self.llm_tokenizer = pipeline.prompt_enhancer_llm_tokenizer
+        # Verifica se os modelos foram realmente carregados
+        if not all([self.caption_model, self.caption_processor, self.llm_model, self.llm_tokenizer]):
+            logger.warning("Deformes3DThinker initialized, but one or more enhancement models were not loaded by the LTX pipeline. Fallback will be used.")
+        else:
+            logger.info("Deformes3DThinker initialized and successfully linked to LTX enhancement models.")
+    @torch.no_grad()
+    def get_enhanced_motion_prompt(self, global_prompt: str, story_history: str,
+                                   past_keyframe_path: str, present_keyframe_path: str, future_keyframe_path: str,
+                                   past_scene_desc: str, present_scene_desc: str, future_scene_desc: str) -> str:
+        """
+        Generates a refined motion prompt by directly executing the enhancement pipeline logic.
+        """
+        # Verifica se os modelos estão disponíveis antes de tentar usá-los
+        if not all([self.caption_model, self.caption_processor, self.llm_model, self.llm_tokenizer]):
+            logger.warning("Enhancement models not available. Using fallback prompt.")
+            return f"A cinematic transition from '{present_scene_desc}' to '{future_scene_desc}'."
         try:
+            present_image = Image.open(present_keyframe_path).convert("RGB")
+            # --- INÍCIO DA LÓGICA COPIADA E ADAPTADA DO LTX ---
+            # 1. Gerar a caption da imagem de referência (presente)
+            image_captions = self._generate_image_captions([present_image])
+            # 2. Construir o prompt para o LLM
+            # Usamos a cena futura como o "prompt do usuário"
+            messages = [
+                {"role": "system", "content": I2V_CINEMATIC_PROMPT},
+                {"role": "user", "content": f"user_prompt: {future_scene_desc}\nimage_caption: {image_captions[0]}"},
+            ]
+            # 3. Gerar e decodificar o prompt final com o LLM
+            enhanced_prompt = self._generate_and_decode_prompts(messages)
+            # --- FIM DA LÓGICA COPIADA E ADAPTADA ---
+            logger.info(f"Deformes3DThinker received enhanced prompt: '{enhanced_prompt}'")
+            return enhanced_prompt
         except Exception as e:
+            logger.error(f"The Film Director (Deformes3D Thinker) failed during enhancement: {e}. Using fallback.", exc_info=True)
+            return f"A smooth, continuous cinematic transition from '{present_scene_desc}' to '{future_scene_desc}'."
+    def _generate_image_captions(self, images: list[Image.Image]) -> list[str]:
         """
+        Lógica interna para gerar captions, copiada do LTX utils.
         """
+        # O modelo Florence-2 do LTX não usa um system_prompt aqui, mas um task_prompt
+        task_prompt = "<MORE_DETAILED_CAPTION>"
         inputs = self.caption_processor(
+            text=[task_prompt] * len(images), images=images, return_tensors="pt"
+        ).to(self.caption_model.device)
         generated_ids = self.caption_model.generate(
             input_ids=inputs["input_ids"],
             num_beams=3,
         )
+        # Usa o post_process_generation para extrair a resposta limpa
         generated_text = self.caption_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
         processed_result = self.caption_processor.post_process_generation(
             generated_text,
             task=task_prompt,
+            image_size=(images[0].width, images[0].height)
         )
+        return [processed_result[task_prompt]]
+    def _generate_and_decode_prompts(self, messages: list[dict]) -> str:
         """
+        Lógica interna para gerar prompt com o LLM, copiada do LTX utils.
         """
+        text = self.llm_tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
         )
+        model_inputs = self.llm_tokenizer([text], return_tensors="pt").to(self.llm_model.device)
+        output_ids = self.llm_model.generate(**model_inputs, max_new_tokens=256)
+        input_ids_len = model_inputs.input_ids.shape[1]
+        decoded_prompts = self.llm_tokenizer.batch_decode(
+            output_ids[:, input_ids_len:], skip_special_tokens=True
+        )
+        return decoded_prompts[0].strip()
 # --- Singleton Instantiation ---
 try:
+    deformes3d_thinker_singleton = Deformes3DThinker()
 except Exception as e:
+    # A falha já terá sido logada dentro do __init__
+    deformes3d_thinker_singleton = None
     raise e