google
/

pix2struct-textcaps-base

@@ -68,19 +68,104 @@ processor.push_to_hub("USERNAME/MODEL_NAME")
 ## Running the model
-TODO
-# Results
-TODO
-# Introduction to UL2
-TODO
-# Fine-tuning
-TODO
 # Contribution

 ## Running the model
+### In full precision, on CPU:
+You can run the model in full precision on CPU:
+```python
+import requests
+from PIL import Image
+from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
+url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+model = Pix2StructForConditionalGeneration.from_pretrained("ybelkada/pix2struct-textcaps-base")
+processor = Pix2StructProcessor.from_pretrained("ybelkada/pix2struct-textcaps-base")
+# image only
+inputs = processor(images=image, return_tensors="pt")
+predictions = model.generate(**inputs)
+print(processor.decode(predictions[0], skip_special_tokens=True))
+>>> A stop sign is on a street corner.
+```
+### In full precision, on GPU:
+You can run the model in full precision on GPU:
+```python
+import requests
+from PIL import Image
+from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
+url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+model = Pix2StructForConditionalGeneration.from_pretrained("ybelkada/pix2struct-textcaps-base").to("cuda")
+processor = Pix2StructProcessor.from_pretrained("ybelkada/pix2struct-textcaps-base")
+# image only
+inputs = processor(images=image, return_tensors="pt").to("cuda")
+predictions = model.generate(**inputs)
+print(processor.decode(predictions[0], skip_special_tokens=True))
+>>> A stop sign is on a street corner.
+```
+### In half precision, on GPU:
+You can run the model in half precision on GPU:
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
+url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+model = Pix2StructForConditionalGeneration.from_pretrained("ybelkada/pix2struct-textcaps-base", torch_dtype=torch.bfloat16).to("cuda")
+processor = Pix2StructProcessor.from_pretrained("ybelkada/pix2struct-textcaps-base")
+# image only
+inputs = processor(images=image, return_tensors="pt").to("cuda", torch.bfloat16)
+predictions = model.generate(**inputs)
+print(processor.decode(predictions[0], skip_special_tokens=True))
+>>> A stop sign is on a street corner.
+```
+### Use different sequence length
+This model has been trained on a sequence length of `2048`. You can reduce the sequence length for more memory-efficient inference, but you may observe some performance degradation for small sequence lengths (<512). Just pass `max_patches` when calling the processor:
+```python
+inputs = processor(images=image, return_tensors="pt", max_patches=512)
+```
+### Conditional generation
+You can also prepend some input text to perform conditional generation:
+```python
+import requests
+from PIL import Image
+from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
+url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+text = "A picture of"
+model = Pix2StructForConditionalGeneration.from_pretrained("ybelkada/pix2struct-textcaps-base")
+processor = Pix2StructProcessor.from_pretrained("ybelkada/pix2struct-textcaps-base")
+# image and text
+inputs = processor(images=image, text=text, return_tensors="pt")
+predictions = model.generate(**inputs)
+print(processor.decode(predictions[0], skip_special_tokens=True))
+>>> A picture of a stop sign that says yes.
+```
 # Contribution