File size: 6,243 Bytes
12e0b76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a957837
12e0b76
 
f2fcfe2
12e0b76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# engineers/deformes3D_thinker.py
#
# Copyright (C) 2025 Carlos Rodrigues dos Santos
#
# Version: 4.0.0 (Definitive)
#
# This is the definitive, robust implementation. It directly contains the prompt
# enhancement logic copied from the LTX pipeline's utils. It accesses the
# enhancement models loaded by the LTX Manager and performs the captioning
# and LLM generation steps locally, ensuring full control and compatibility.

import logging
from PIL import Image
import torch

# Import the LTX manager singleton to access its pipeline and the models loaded in it
from ..managers.ltx_manager import ltx_manager_singleton

# Import the LTX system prompt to ensure consistency with the LTX pipeline
from ltx_video.utils.prompt_enhance_utils import I2V_CINEMATIC_PROMPT

logger = logging.getLogger(__name__)

class Deformes3DThinker:
    """
    The tactical specialist that directly implements the prompt enhancement
    logic, using the models provided by the LTX pipeline.

    The captioning model/processor and the LLM/tokenizer are read from
    ``ltx_manager_singleton.prompt_enhancement_pipeline`` at construction
    time and stored as instance attributes. When any of them is missing,
    ``get_enhanced_motion_prompt`` returns a fixed fallback sentence instead
    of running the models.
    """

    def __init__(self):
        """
        Link this instance to the enhancement models exposed by the LTX manager.

        Raises:
            RuntimeError: If the LTX manager exposes no prompt enhancement
                pipeline.
        """
        # Access the exposed pipeline to obtain the required models.
        pipeline = ltx_manager_singleton.prompt_enhancement_pipeline
        if not pipeline:
            raise RuntimeError("Deformes3DThinker could not access the LTX pipeline.")

        # Store the models and processors as direct attributes.
        self.caption_model = pipeline.prompt_enhancer_image_caption_model
        self.caption_processor = pipeline.prompt_enhancer_image_caption_processor
        self.llm_model = pipeline.prompt_enhancer_llm_model
        self.llm_tokenizer = pipeline.prompt_enhancer_llm_tokenizer

        # Check whether the models were actually loaded.
        if not self._models_available():
            logger.warning("Deformes3DThinker initialized, but one or more enhancement models were not loaded by the LTX pipeline. Fallback will be used.")
        else:
            logger.info("Deformes3DThinker initialized and successfully linked to LTX enhancement models.")

    def _models_available(self) -> bool:
        """Return True when all four enhancement components are set (truthy)."""
        return all([self.caption_model, self.caption_processor, self.llm_model, self.llm_tokenizer])

    @torch.no_grad()
    def get_enhanced_motion_prompt(self, global_prompt: str, story_history: str,
                                   past_keyframe_path: str, present_keyframe_path: str, future_keyframe_path: str,
                                   past_scene_desc: str, present_scene_desc: str, future_scene_desc: str) -> str:
        """
        Generate a refined motion prompt by directly executing the enhancement pipeline logic.

        The present keyframe is captioned, and the caption is sent to the LLM
        together with ``future_scene_desc`` (used as the "user prompt") under
        the ``I2V_CINEMATIC_PROMPT`` system prompt.

        Args:
            global_prompt: Currently unused; kept for interface compatibility.
            story_history: Currently unused; kept for interface compatibility.
            past_keyframe_path: Currently unused; kept for interface compatibility.
            present_keyframe_path: Path of the image that gets captioned.
            future_keyframe_path: Currently unused; kept for interface compatibility.
            past_scene_desc: Currently unused; kept for interface compatibility.
            present_scene_desc: Only used to build the fallback sentence.
            future_scene_desc: Passed to the LLM as the user prompt and used in
                the fallback sentence.

        Returns:
            The LLM-generated prompt, or a fixed fallback sentence when the
            models are unavailable or any step raises.
        """
        # Check that the models are available before trying to use them.
        if not self._models_available():
            logger.warning("Enhancement models not available. Using fallback prompt.")
            return f"A cinematic transition from '{present_scene_desc}' to '{future_scene_desc}'."

        try:
            # Image.open keeps the underlying file open; convert() returns an
            # independent, fully loaded copy, so the file can be closed right away.
            with Image.open(present_keyframe_path) as keyframe_file:
                present_image = keyframe_file.convert("RGB")

            # --- START OF LOGIC COPIED AND ADAPTED FROM LTX ---

            # 1. Generate the caption for the reference (present) image.
            image_captions = self._generate_image_captions([present_image])

            # 2. Build the prompt for the LLM.
            # The future scene is used as the "user prompt".
            messages = [
                {"role": "system", "content": I2V_CINEMATIC_PROMPT},
                {"role": "user", "content": f"user_prompt: {future_scene_desc}\nimage_caption: {image_captions[0]}"},
            ]

            # 3. Generate and decode the final prompt with the LLM.
            enhanced_prompt = self._generate_and_decode_prompts(messages)

            # --- END OF COPIED AND ADAPTED LOGIC ---

            logger.info(f"Deformes3DThinker received enhanced prompt: '{enhanced_prompt}'")
            return enhanced_prompt

        except Exception as e:
            # Deliberate best-effort boundary: any failure degrades to a generic prompt.
            logger.error(f"The Film Director (Deformes3D Thinker) failed during enhancement: {e}. Using fallback.", exc_info=True)
            return f"A smooth, continuous cinematic transition from '{present_scene_desc}' to '{future_scene_desc}'."

    def _generate_image_captions(self, images: list[Image.Image]) -> list[str]:
        """
        Internal logic for generating captions, adapted from the LTX utils.

        Args:
            images: Images to caption.

        Returns:
            One caption per input image, in input order. An empty input
            yields an empty list without invoking the model.
        """
        if not images:
            return []

        # The caption model is driven by a task prompt rather than a system prompt.
        task_prompt = "<MORE_DETAILED_CAPTION>"
        inputs = self.caption_processor(
            text=[task_prompt] * len(images), images=images, return_tensors="pt"
        ).to(self.caption_model.device)

        generated_ids = self.caption_model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            num_beams=3,
        )

        # Decode every sequence of the batch (not just the first one) and use
        # post_process_generation to extract the clean answer for each image,
        # passing that image's own size.
        # NOTE(review): with more than one image, shorter sequences may carry
        # padding tokens in the decoded text since special tokens are kept —
        # confirm post_process_generation strips them.
        generated_texts = self.caption_processor.batch_decode(generated_ids, skip_special_tokens=False)
        captions = []
        for image, generated_text in zip(images, generated_texts):
            processed_result = self.caption_processor.post_process_generation(
                generated_text,
                task=task_prompt,
                image_size=(image.width, image.height),
            )
            captions.append(processed_result[task_prompt])
        return captions

    def _generate_and_decode_prompts(self, messages: list[dict]) -> str:
        """
        Internal logic for generating a prompt with the LLM, adapted from the LTX utils.

        Args:
            messages: Chat messages (dicts with "role" and "content") that are
                rendered through the tokenizer's chat template.

        Returns:
            The newly generated text only (the echoed input is removed),
            stripped of surrounding whitespace.
        """
        text = self.llm_tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        model_inputs = self.llm_tokenizer([text], return_tensors="pt").to(self.llm_model.device)

        output_ids = self.llm_model.generate(**model_inputs, max_new_tokens=256)

        # Slice off the first input_ids_len positions so only the continuation
        # after the prompt is decoded.
        input_ids_len = model_inputs.input_ids.shape[1]
        decoded_prompts = self.llm_tokenizer.batch_decode(
            output_ids[:, input_ids_len:], skip_special_tokens=True
        )
        return decoded_prompts[0].strip()

# --- Singleton Instantiation ---
try:
    deformes3d_thinker_singleton = Deformes3DThinker()
except Exception:
    # __init__ raises without logging, so record the failure (with traceback)
    # here before propagating it unchanged to the importer.
    logger.exception("Failed to instantiate the Deformes3DThinker singleton.")
    deformes3d_thinker_singleton = None
    raise