# Hugging Face Space (page status at scrape time: Paused)
import torch
import gradio as gr
from transformers import AutoModel
from transformers import AutoProcessor
import spaces

# Load the UForm-Gen2 (DPO-tuned) vision-language model and its matching
# processor once at import time so every request reuses the same weights.
# NOTE(review): trust_remote_code executes Python shipped inside the hub
# repo — consider pinning a specific revision for reproducibility/safety.
model3 = AutoModel.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
processor = AutoProcessor.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
# Define a function for image captioning / visual question answering.
def videochat(image3, prompt3):
    """Yield the model's text response for an image + prompt pair.

    Args:
        image3: Input image (anything the processor's image pipeline accepts,
            e.g. a PIL image).
        prompt3: Text prompt guiding the generation.

    Yields:
        str: The decoded generation with the trailing ``<|im_end|>``
        end-of-turn marker removed. Written as a generator so Gradio can
        treat the output as a (single-step) stream.
    """
    # Tokenize the prompt and preprocess the image into model tensors.
    inputs = processor(text=[prompt3], images=[image3], return_tensors="pt")
    # Greedy decoding; inference_mode avoids autograd bookkeeping.
    with torch.inference_mode():
        output = model3.generate(
            **inputs,
            do_sample=False,
            use_cache=True,
            max_new_tokens=256,
            # 151645 is the id of the "<|im_end|>" end-of-turn token in this
            # model's (Qwen-style) tokenizer — presumably; TODO confirm via
            # processor.tokenizer.convert_tokens_to_ids("<|im_end|>").
            eos_token_id=151645,
            pad_token_id=processor.tokenizer.pad_token_id,
        )
    # Slice off the prompt tokens so only newly generated text is decoded.
    prompt_len = inputs["input_ids"].shape[1]
    decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
    # Drop the end-of-turn marker if present (replaces the fragile magic
    # slice decoded_text[:-10], which silently assumed the marker length).
    decoded_text = decoded_text.removesuffix("<|im_end|>")
    yield decoded_text