autonomous019 committed
Commit 43604c6
1 Parent(s): 41c00ad

code in comments pre-staging

Files changed (1)
app.py +16 -2
app.py CHANGED
@@ -5,6 +5,7 @@ import requests
 import matplotlib.pyplot as plt
 import gradio as gr
 from transformers import ImageClassificationPipeline, PerceiverForImageClassificationConvProcessing, PerceiverFeatureExtractor
+from transformers import VisionEncoderDecoderModel
 from transformers import AutoTokenizer
 import torch
 
@@ -27,12 +28,25 @@ model = PerceiverForImageClassificationConvProcessing.from_pretrained("deepmind/
 image_pipe = ImageClassificationPipeline(model=model, feature_extractor=feature_extractor)
 
 '''
-repo_name = "ydshieh/vit-gpt2-coco-en"
 
+
+# initialize a vit-bert from a pretrained ViT and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
+model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
+    "google/vit-base-patch16-224-in21k", "bert-base-uncased"
+)
+# saving model after fine-tuning
+model.save_pretrained("./vit-bert")
+# load fine-tuned model
+model = VisionEncoderDecoderModel.from_pretrained("./vit-bert")
+
+
+
+repo_name = "ydshieh/vit-gpt2-coco-en"
+test_image = "cats.jpg"
 feature_extractor2 = ViTFeatureExtractor.from_pretrained(repo_name)
 tokenizer = AutoTokenizer.from_pretrained(repo_name)
 model2 = VisionEncoderDecoderModel.from_pretrained(repo_name)
-pixel_values = feature_extractor2(image, return_tensors="pt").pixel_values
+pixel_values = feature_extractor2(test_image, return_tensors="pt").pixel_values
 
 # autoregressively generate text (using beam search or other decoding strategy)
 generated_ids = model2.generate(pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True)
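The first half of the staged block follows the standard VisionEncoderDecoderModel recipe: graft a pretrained ViT encoder onto a pretrained BERT decoder, then save and reload the combined model. As a standalone, runnable sketch of that init / save / reload cycle (checkpoint names are taken from the commit; "./vit-bert" is just a scratch path):

from transformers import VisionEncoderDecoderModel

# Build an encoder-decoder from a pretrained ViT encoder and BERT decoder.
# The cross-attention layers connecting the two stacks are randomly
# initialized, so the model needs fine-tuning before it captions images.
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    "google/vit-base-patch16-224-in21k", "bert-base-uncased"
)

# Persist the (fine-tuned) weights, then reload them later.
model.save_pretrained("./vit-bert")
model = VisionEncoderDecoderModel.from_pretrained("./vit-bert")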
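As committed, the captioning snippet would fail if uncommented: ViTFeatureExtractor is never imported, the extractor is handed the filename string "cats.jpg" rather than a decoded image, and the generated token ids are never turned back into text. A minimal corrected sketch, assuming PIL for image loading and keeping the commit's checkpoint name and placeholder image path:

from PIL import Image
from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel

repo_name = "ydshieh/vit-gpt2-coco-en"

feature_extractor2 = ViTFeatureExtractor.from_pretrained(repo_name)
tokenizer = AutoTokenizer.from_pretrained(repo_name)
model2 = VisionEncoderDecoderModel.from_pretrained(repo_name)

# The feature extractor expects a decoded image (PIL image or ndarray),
# not a file path, so open the placeholder image from the commit first.
image = Image.open("cats.jpg").convert("RGB")
pixel_values = feature_extractor2(image, return_tensors="pt").pixel_values

# Autoregressively generate a caption with beam search.
generated = model2.generate(
    pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True
)

# Decode the token ids back to text, a step the staged snippet omits.
caption = tokenizer.batch_decode(generated.sequences, skip_special_tokens=True)[0]
print(caption)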