Aduc_sdr

Paused

App Files Files Community

Aduc-sdr committed on Sep 4

Commit

25456f6

verified ·

1 Parent(s): 8ecb750

Create prompt_enhancer_manager.py

Browse files

Files changed (1) hide show

managers/prompt_enhancer_manager.py +99 -0

managers/prompt_enhancer_manager.py ADDED Viewed

	@@ -0,0 +1,99 @@

+# managers/prompt_enhancer_manager.py
+#
+# Copyright (C) 2025 Carlos Rodrigues dos Santos
+#
+# Version: 1.0.0
+#
+# This is a dedicated specialist responsible for enhancing prompts. It loads
+# an image captioning model and a powerful LLM to create rich, cinematic
+# motion prompts based on visual and textual context.
+import torch
+import logging
+import yaml
+from PIL import Image
+from transformers import AutoProcessor, AutoModelForCausalLM, AutoTokenizer
+from pathlib import Path
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+# The system prompt that will guide our LLM. It is sent verbatim as the
+# "system" message of the chat built in generate_enhanced_prompt().
+ENHANCER_SYSTEM_PROMPT = """You are an expert cinematic director. Your task is to write a single, rich, cinematic motion prompt.
+Analyze the user's goal and the provided image caption. Synthesize them into a flowing, descriptive paragraph under 150 words.
+Focus on the action, character expressions, camera movement, and environment. Start directly with the action.
+The final prompt must be a direct instruction for a video generation AI."""
+class PromptEnhancerManager:
+    def __init__(self):
+        logger.info("Initializing Prompt Enhancer Manager...")
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.dtype = torch.bfloat16 if self.device == "cuda" else torch.float32
+        try:
+            with open("config.yaml", 'r') as f:
+                config = yaml.safe_load(f)['specialists']['prompt_enhancer']
+            caption_model_name = config['image_caption_model']
+            llm_model_name = config['llm_model']
+            logger.info(f"Loading Image Caption Model: {caption_model_name}...")
+            self.caption_processor = AutoProcessor.from_pretrained(caption_model_name, trust_remote_code=True)
+            self.caption_model = AutoModelForCausalLM.from_pretrained(
+                caption_model_name, torch_dtype=self.dtype, trust_remote_code=True
+            ).to(self.device)
+            logger.info(f"Loading LLM for Prompt Enhancement: {llm_model_name}...")
+            self.llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
+            self.llm_model = AutoModelForCausalLM.from_pretrained(
+                llm_model_name,
+                torch_dtype=self.dtype,
+                device_map="auto" # Deixa o accelerate gerenciar a distribuição em GPUs
+            )
+            logger.info("Prompt Enhancer Manager initialized successfully.")
+        except Exception as e:
+            logger.critical("Failed to initialize PromptEnhancerManager.", exc_info=True)
+            raise e
+    @torch.no_grad()
+    def generate_enhanced_prompt(self, image: Image.Image, user_prompt: str) -> str:
+        """
+        Takes a reference image and a user prompt, and returns an enhanced,
+        cinematic prompt generated by the LLM.
+        """
+        logger.info("Generating enhanced prompt...")
+        # 1. Gerar a caption da imagem
+        caption_task_prompt = "<MORE_DETAILED_CAPTION>"
+        inputs = self.caption_processor(
+            text=caption_task_prompt, images=image, return_tensors="pt"
+        ).to(self.device, self.dtype)
+        generated_ids = self.caption_model.generate(**inputs, max_new_tokens=1024)
+        generated_texts = self.caption_processor.batch_decode(generated_ids, skip_special_tokens=True)
+        image_caption = generated_texts[0].split(":", 1)[-1].strip()
+        logger.info(f"Generated Image Caption: '{image_caption}'")
+        # 2. Construir a conversa para o LLM
+        messages = [
+            {"role": "system", "content": ENHANCER_SYSTEM_PROMPT},
+            {"role": "user", "content": f"My Goal: '{user_prompt}'\n\nReference Image Scene: '{image_caption}'"}
+        ]
+        input_ids = self.llm_tokenizer.apply_chat_template(
+            messages, add_generation_prompt=True, return_tensors="pt"
+        ).to(self.llm_model.device)
+        # 3. Gerar a resposta do LLM
+        outputs = self.llm_model.generate(input_ids, max_new_tokens=256, do_sample=True, temperature=0.6, top_p=0.9)
+        response = self.llm_tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
+        logger.info(f"LLM Enhanced Prompt: '{response}'")
+        return response.strip()
+# --- Singleton Instantiation ---
+try:
+    prompt_enhancer_manager_singleton = PromptEnhancerManager()
+except Exception as e:
+    prompt_enhancer_manager_singleton = None
+    raise e