"""Generate a text description of an image with the Fuyu-8B model."""

from PIL import Image
from transformers import FuyuForCausalLM, FuyuProcessor

# Single source of truth for values the script uses in more than one place.
device = "cuda:0"
max_new_tokens = 100

# Open the file in a context manager so its handle is released right away;
# `convert` returns an independent in-memory RGB copy that outlives the file.
with Image.open("stackoverflow.png") as source_image:
    image = source_image.convert("RGB")

# load model and processor
model_id = "adept/fuyu-8b"
processor = FuyuProcessor.from_pretrained(model_id)
model = FuyuForCausalLM.from_pretrained(model_id, device_map=device)

# prepare inputs for the model
text_prompt = "Description:\n"
inputs = processor(text=text_prompt, images=image, return_tensors="pt").to(device)

# autoregressively generate text
generation_output = model.generate(**inputs, max_new_tokens=max_new_tokens)

# The returned sequences start with the prompt tokens, so drop exactly the
# prompt length. Slicing a fixed `-max_new_tokens` window instead would pull
# prompt tokens into the decoded text whenever generation stops early and
# produces fewer than `max_new_tokens` tokens.
prompt_length = inputs["input_ids"].shape[1]
generation_text = processor.batch_decode(
    generation_output[:, prompt_length:], skip_special_tokens=True
)
print(generation_text)