from transformers import FuyuProcessor, FuyuForCausalLM
from PIL import Image

image = Image.open("stackoverflow.png").convert("RGB")

# load model and processor
model_id = "adept/fuyu-8b"
processor = FuyuProcessor.from_pretrained(model_id)
model = FuyuForCausalLM.from_pretrained(model_id, device_map="cuda:0")
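# note: device_map="cuda:0" places the whole ~8B-parameter model on a single
# GPU in its default precision. If that does not fit in VRAM, from_pretrained
# also accepts torch_dtype=torch.float16 (requires `import torch`); that is an
# optional memory-saving tweak, not something this example relies on.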

# prepare inputs for the model
text_prompt = "Description:\n"
inputs = processor(text=text_prompt, images=image, return_tensors="pt").to("cuda:0")
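# the processor pairs the tokenized prompt with the image tensors Fuyu expects
# (roughly: input_ids plus image patch inputs); .to("cuda:0") moves them all to the GPU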

# autoregressively generate text
generation_output = model.generate(**inputs, max_new_tokens=100)
generation_text = processor.batch_decode(generation_output[:, -100:], skip_special_tokens=True)

print(generation_text)
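A note on the decoding step: for a decoder-only model like Fuyu, generate returns the prompt tokens followed by the newly generated tokens, so the slice [:, -100:] is only correct while it matches max_new_tokens=100. A minimal, more robust sketch (reusing the model, processor, and inputs defined above) slices by the actual prompt length instead:

prompt_len = inputs["input_ids"].shape[1]
generation_output = model.generate(**inputs, max_new_tokens=100)
# drop the prompt tokens, keep only what the model generated
generation_text = processor.batch_decode(generation_output[:, prompt_len:], skip_special_tokens=True)
print(generation_text)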