Gengzigang committed
Commit d683b1e
1 Parent(s): f489540
Files changed (1): README.md (+3, -3)
README.md CHANGED

@@ -70,14 +70,14 @@ processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")
model_name_or_path = "microsoft/LLM2CLIP-Openai-L-14-336" # or /path/to/local/LLM2CLIP-Openai-L-14-336
model = AutoModel.from_pretrained(
    model_name_or_path,
-    torch_dtype=torch.float16,
+    torch_dtype=torch.bfloat16,
    trust_remote_code=True).to('cuda').eval()

llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
    llm_model_name, trust_remote_code=True
)
-llm_model = AutoModel.from_pretrained(llm_model_name, config=config, trust_remote_code=True)
+llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)

@@ -87,10 +87,10 @@ image_path = "CLIP.png"

image = Image.open(image_path)
input_pixels = processor(images=image, return_tensors="pt").pixel_values.to('cuda')
+text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')

with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.get_image_features(input_pixels)
-    text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
    text_features = model.get_text_features(text_features)

    image_features /= image_features.norm(dim=-1, keepdim=True)
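
For reference, a minimal end-to-end sketch of how the snippet might read after this commit. The imports, the example `captions` list, and the final normalization/similarity step are assumptions based on typical CLIP-style usage, not part of the diff itself; only the model loading and feature-extraction lines come from the README.

```python
# Sketch of the updated usage, assuming standard imports and a CLIP-style
# similarity readout at the end (both are assumptions, not from the commit).
import torch
from PIL import Image
from transformers import AutoModel, AutoConfig, AutoTokenizer, CLIPImageProcessor
from llm2vec import LLM2Vec  # assumed source of the LLM2Vec wrapper used in the README

processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")

# Vision/CLIP model, loaded in bfloat16 as introduced by this commit.
model_name_or_path = "microsoft/LLM2CLIP-Openai-L-14-336"
model = AutoModel.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).to('cuda').eval()

# Fine-tuned Llama-3 text encoder, also in bfloat16 after this commit.
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(llm_model_name, trust_remote_code=True)
llm_model = AutoModel.from_pretrained(
    llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct'  # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)

captions = ["a diagram", "a dog", "a cat"]  # assumed example captions
image = Image.open("CLIP.png")
input_pixels = processor(images=image, return_tensors="pt").pixel_values.to('cuda')
# Text encoding now happens before the no_grad/autocast block, as in the commit.
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')

with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.get_image_features(input_pixels)
    text_features = model.get_text_features(text_features)

    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    # Assumed final step: cosine-similarity logits turned into label probabilities.
    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print("Label probs:", text_probs)
```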