Aduc-sdr committed on
Commit
c957461
·
verified ·
1 Parent(s): 4e9876a

Update engineers/deformes3D_thinker.py

Browse files
Files changed (1) hide show
  1. engineers/deformes3D_thinker.py +108 -41
engineers/deformes3D_thinker.py CHANGED
@@ -1,69 +1,136 @@
1
  # engineers/deformes3D_thinker.py
2
  #
3
- # Version 3.1.1
4
  #
5
- # This version correctly separates the responsibilities:
6
- # 1. The Thinker assembles the CONTENT of the user prompt with all the rich context.
7
- # 2. The EnhancerManager combines that content with its SYSTEM PROMPT (loaded from the config)
8
- # to generate the final LLM response.
 
 
9
 
10
  import logging
11
  from PIL import Image
12
- import gradio as gr
 
 
 
13
 
14
- from managers.prompt_enhancer_manager import prompt_enhancer_manager_singleton
 
15
 
16
  logger = logging.getLogger(__name__)
17
 
18
  class Deformes3DThinker:
  # Pre-change implementation (the "-" lines are removed by this commit): captions the
  # past/present/future keyframes through the PromptEnhancerManager, assembles a
  # user prompt from them, and returns the manager's LLM-enhanced motion prompt.
 
 
 
 
 
19
  def __init__(self):
20
- if not prompt_enhancer_manager_singleton:
21
- raise RuntimeError("Deformes3DThinker requires the PromptEnhancerManager.")
22
- self.enhancer = prompt_enhancer_manager_singleton
23
- logger.info("Deformes3DThinker initialized and linked to PromptEnhancerManager.")
 
 
 
 
 
 
 
 
 
 
 
 
24
 
 
25
  def get_enhanced_motion_prompt(self, global_prompt: str, story_history: str,
26
  past_keyframe_path: str, present_keyframe_path: str, future_keyframe_path: str,
27
  past_scene_desc: str, present_scene_desc: str, future_scene_desc: str) -> str:
 
 
 
 
 
 
 
 
28
  try:
29
- logger.info("Assembling rich context for cinematic prompt...")
30
- # 1. Get descriptions (captions) for the context images
31
- past_image = Image.open(past_keyframe_path).convert("RGB")
32
  present_image = Image.open(present_keyframe_path).convert("RGB")
33
- future_image = Image.open(future_keyframe_path).convert("RGB")
34
 
35
- past_caption = self.enhancer.get_image_caption(past_image)
36
- present_caption = self.enhancer.get_image_caption(present_image)
37
- future_caption = self.enhancer.get_image_caption(future_image)
38
 
39
- # 2. Assemble the "user" part of the prompt with ALL the rich context.
40
- # This is what the LLM will see as the user's instruction.
41
- user_content_prompt = f"""
42
- CONTEXT:
43
- - Global Story Goal: {global_prompt}
44
- - Creative History (what happened before): {story_history}
45
-
46
- SCENE ANALYSIS:
47
- - The Past: "{past_scene_desc}" (Visual: {past_caption})
48
- - The Present: "{present_scene_desc}" (Visual: {present_caption})
49
- - The Future Goal: "{future_scene_desc}" (Visual: {future_caption})
50
-
51
- TASK:
52
- Based on all the context above, write the cinematic motion prompt that connects the PRESENT to the FUTURE.
53
- """
54
 
55
- # 3. Delegate the final LLM call, passing only the user content.
56
- # The EnhancerManager will add the "system prompt" automatically.
57
- enhanced_prompt = self.enhancer.get_llm_enhanced_prompt(user_content_prompt)
 
 
 
 
 
 
58
 
 
 
59
  logger.info(f"Deformes3DThinker received enhanced prompt: '{enhanced_prompt}'")
60
  return enhanced_prompt
61
 
62
  except Exception as e:
63
- logger.error(f"The Film Director (Deformes3D Thinker) failed: {e}. Using fallback.", exc_info=True)
64
- fallback_prompt = f"A smooth, continuous cinematic transition from '{present_scene_desc}' to '{future_scene_desc}'."
65
- logger.info(f"Deformes3DThinker Fallback -> Motion Prompt: '{fallback_prompt}'")
66
- return fallback_prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  # --- Singleton instantiation: module-level shared instance, created when this module is imported ---
69
- deformes3d_thinker_singleton = Deformes3DThinker()
 
 
 
 
 
 
1
  # engineers/deformes3D_thinker.py
2
  #
3
+ # Copyright (C) 2025 Carlos Rodrigues dos Santos
4
  #
5
+ # Version: 4.0.0 (Definitive)
6
+ #
7
+ # This is the definitive, robust implementation. It directly contains the prompt
8
+ # enhancement logic copied from the LTX pipeline's utils. It accesses the
9
+ # enhancement models loaded by the LTX Manager and performs the captioning
10
+ # and LLM generation steps locally, ensuring full control and compatibility.
11
 
12
  import logging
13
  from PIL import Image
14
+ import torch
15
+
16
+ # Import the LTX singleton to access its pipeline and the models it holds
17
+ from managers.ltx_manager import ltx_manager_singleton
18
 
19
+ # Import the LTX system prompt to ensure consistency
20
+ from ltx_video.utils.prompt_enhance_utils import I2V_CINEMATIC_PROMPT
21
 
22
  logger = logging.getLogger(__name__)
23
 
24
  class Deformes3DThinker:
25
+ """
26
+ The tactical specialist that now directly implements the prompt enhancement
27
+ logic, using the models provided by the LTX pipeline.
28
+ """
29
+
30
  def __init__(self):
31
+ # Acessa a pipeline exposta para obter os modelos necessários
32
+ pipeline = ltx_manager_singleton.prompt_enhancement_pipeline
33
+ if not pipeline:
34
+ raise RuntimeError("Deformes3DThinker could not access the LTX pipeline.")
35
+
36
+ # Armazena os modelos e processadores como atributos diretos
37
+ self.caption_model = pipeline.prompt_enhancer_image_caption_model
38
+ self.caption_processor = pipeline.prompt_enhancer_image_caption_processor
39
+ self.llm_model = pipeline.prompt_enhancer_llm_model
40
+ self.llm_tokenizer = pipeline.prompt_enhancer_llm_tokenizer
41
+
42
+ # Verifica se os modelos foram realmente carregados
43
+ if not all([self.caption_model, self.caption_processor, self.llm_model, self.llm_tokenizer]):
44
+ logger.warning("Deformes3DThinker initialized, but one or more enhancement models were not loaded by the LTX pipeline. Fallback will be used.")
45
+ else:
46
+ logger.info("Deformes3DThinker initialized and successfully linked to LTX enhancement models.")
47
 
48
+ @torch.no_grad()
49
  def get_enhanced_motion_prompt(self, global_prompt: str, story_history: str,
50
  past_keyframe_path: str, present_keyframe_path: str, future_keyframe_path: str,
51
  past_scene_desc: str, present_scene_desc: str, future_scene_desc: str) -> str:
52
+ """
53
+ Generates a refined motion prompt by directly executing the enhancement pipeline logic.
54
+ """
55
+ # Verifica se os modelos estão disponíveis antes de tentar usá-los
56
+ if not all([self.caption_model, self.caption_processor, self.llm_model, self.llm_tokenizer]):
57
+ logger.warning("Enhancement models not available. Using fallback prompt.")
58
+ return f"A cinematic transition from '{present_scene_desc}' to '{future_scene_desc}'."
59
+
60
  try:
 
 
 
61
  present_image = Image.open(present_keyframe_path).convert("RGB")
 
62
 
63
+ # --- INÍCIO DA LÓGICA COPIADA E ADAPTADA DO LTX ---
 
 
64
 
65
+ # 1. Gerar a caption da imagem de referência (presente)
66
+ image_captions = self._generate_image_captions([present_image])
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
+ # 2. Construir o prompt para o LLM
69
+ # Usamos a cena futura como o "prompt do usuário"
70
+ messages = [
71
+ {"role": "system", "content": I2V_CINEMATIC_PROMPT},
72
+ {"role": "user", "content": f"user_prompt: {future_scene_desc}\nimage_caption: {image_captions[0]}"},
73
+ ]
74
+
75
+ # 3. Gerar e decodificar o prompt final com o LLM
76
+ enhanced_prompt = self._generate_and_decode_prompts(messages)
77
 
78
+ # --- FIM DA LÓGICA COPIADA E ADAPTADA ---
79
+
80
  logger.info(f"Deformes3DThinker received enhanced prompt: '{enhanced_prompt}'")
81
  return enhanced_prompt
82
 
83
  except Exception as e:
84
+ logger.error(f"The Film Director (Deformes3D Thinker) failed during enhancement: {e}. Using fallback.", exc_info=True)
85
+ return f"A smooth, continuous cinematic transition from '{present_scene_desc}' to '{future_scene_desc}'."
86
+
87
+ def _generate_image_captions(self, images: list[Image.Image]) -> list[str]:
88
+ """
89
+ Lógica interna para gerar captions, copiada do LTX utils.
90
+ """
91
+ # O modelo Florence-2 do LTX não usa um system_prompt aqui, mas um task_prompt
92
+ task_prompt = "<MORE_DETAILED_CAPTION>"
93
+ inputs = self.caption_processor(
94
+ text=[task_prompt] * len(images), images=images, return_tensors="pt"
95
+ ).to(self.caption_model.device)
96
+
97
+ generated_ids = self.caption_model.generate(
98
+ input_ids=inputs["input_ids"],
99
+ pixel_values=inputs["pixel_values"],
100
+ max_new_tokens=1024,
101
+ num_beams=3,
102
+ )
103
+
104
+ # Usa o post_process_generation para extrair a resposta limpa
105
+ generated_text = self.caption_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
106
+ processed_result = self.caption_processor.post_process_generation(
107
+ generated_text,
108
+ task=task_prompt,
109
+ image_size=(images[0].width, images[0].height)
110
+ )
111
+ return [processed_result[task_prompt]]
112
+
113
+ def _generate_and_decode_prompts(self, messages: list[dict]) -> str:
114
+ """
115
+ Lógica interna para gerar prompt com o LLM, copiada do LTX utils.
116
+ """
117
+ text = self.llm_tokenizer.apply_chat_template(
118
+ messages, tokenize=False, add_generation_prompt=True
119
+ )
120
+ model_inputs = self.llm_tokenizer([text], return_tensors="pt").to(self.llm_model.device)
121
+
122
+ output_ids = self.llm_model.generate(**model_inputs, max_new_tokens=256)
123
+
124
+ input_ids_len = model_inputs.input_ids.shape[1]
125
+ decoded_prompts = self.llm_tokenizer.batch_decode(
126
+ output_ids[:, input_ids_len:], skip_special_tokens=True
127
+ )
128
+ return decoded_prompts[0].strip()
129
 
130
  # --- Singleton Instantiation ---
131
+ try:
132
+ deformes3d_thinker_singleton = Deformes3DThinker()
133
+ except Exception as e:
134
+ # A falha já terá sido logada dentro do __init__
135
+ deformes3d_thinker_singleton = None
136
+ raise e