autonomous019 committed on
Commit 59c096a
1 Parent(s): 8be0d99

added link

Files changed (1)
  1. app.py +7 -30
app.py CHANGED
@@ -10,7 +10,7 @@ from transformers import VisionEncoderDecoderModel
 from transformers import AutoTokenizer
 import torch
 
-
+# https://github.com/NielsRogge/Transformers-Tutorials/blob/master/HuggingFace_vision_ecosystem_overview_(June_2022).ipynb
 # option 1: load with randomly initialized weights (train from scratch)
 
 config = ViTConfig(num_hidden_layers=12, hidden_size=768)
@@ -28,22 +28,6 @@ model = PerceiverForImageClassificationConvProcessing.from_pretrained("deepmind/
 
 image_pipe = ImageClassificationPipeline(model=model, feature_extractor=feature_extractor)
 
-'''
-
-
-# initialize a vit-bert from a pretrained ViT and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
-model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
-    "google/vit-base-patch16-224-in21k", "bert-base-uncased"
-)
-# saving model after fine-tuning
-model.save_pretrained("./vit-bert")
-# load fine-tuned model
-model = VisionEncoderDecoderModel.from_pretrained("./vit-bert")
-
-
-'''
-
-
 
 def self_caption(image):
     repo_name = "ydshieh/vit-gpt2-coco-en"
@@ -62,13 +46,8 @@ def self_caption(image):
     # autoregressively generate text (using beam search or other decoding strategy)
     generated_ids = model2.generate(pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True)
 
-
     # decode into text
     preds = tokenizer.batch_decode(generated_ids[0], skip_special_tokens=True)
-    #generated_sentences = tokenizer.batch_decode(encoder_outputs, skip_special_tokens=True)
-
-    #return(generated_sentences[0].split('.')[0])
-
     preds = [pred.strip() for pred in preds]
     print("Predictions")
     print(preds)
@@ -79,8 +58,9 @@ def self_caption(image):
     pred_dictionary = dict(zip(pred_keys, pred_value))
     print("Pred dictionary")
     print(pred_dictionary)
-    return(pred_dictionary)
-    #return(preds[0].split('.')[0])
+    #return(pred_dictionary)
+    return preds
+
 
 def classify_image(image):
     results = image_pipe(image)
@@ -99,21 +79,18 @@ def classify_image(image):
 
 
 image = gr.inputs.Image(type="pil")
-image_piped = ""
 label = gr.outputs.Label(num_top_classes=5)
 examples = [["cats.jpg"]]
 title = "Generate a Story from an Image"
-description = "Demo for classifying images with Perceiver IO. To use it, simply upload an image and click 'submit' to let the model predict the 5 most probable ImageNet classes. Results will show up in a few seconds." + image_piped
+description = "Demo for classifying images with Perceiver IO. To use it, simply upload an image and click 'submit', a caption is autogenerated as well"
 article = "<p style='text-align: center'></p>"
 
-#gr.Interface(fn=classify_image, inputs=image, outputs=label, title=title, description=description, examples="", enable_queue=True).launch(debug=True)
-print("img_info1")
 img_info1 = gr.Interface(
     fn=classify_image,
     inputs=image,
     outputs=label,
 )
-print("img_info2")
+
 img_info2 = gr.Interface(
     fn=self_caption,
     inputs=image,
@@ -122,7 +99,7 @@ img_info2 = gr.Interface(
     gr.outputs.Textbox(label = 'Caption')
     ],
 )
-print("running parallel call")
+
 Parallel(img_info1,img_info2, inputs=image, title=title, description=description, examples=examples, enable_queue=True).launch(debug=True)
 #Parallel(img_info1,img_info2, inputs=image, outputs=label, title=title, description=description, examples=examples, enable_queue=True).launch(debug=True)
 
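For reference, the captioning path kept by this change loads the ydshieh/vit-gpt2-coco-en checkpoint and, after the edit above, returns the decoded predictions directly instead of a dictionary. Below is a minimal sketch of that flow, following the tutorial notebook linked in the new comment; the feature_extractor, tokenizer, and model2 loading lines are assumptions about parts of app.py that this diff does not show.

# Minimal sketch of the self_caption flow (assumed setup; only the
# generate/decode lines appear in this diff).
from PIL import Image
from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel

repo_name = "ydshieh/vit-gpt2-coco-en"
feature_extractor = ViTFeatureExtractor.from_pretrained(repo_name)   # assumed
tokenizer = AutoTokenizer.from_pretrained(repo_name)                 # assumed
model2 = VisionEncoderDecoderModel.from_pretrained(repo_name)        # assumed

image = Image.open("cats.jpg")
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values

# autoregressively generate token ids with beam search, then decode into a caption
generated_ids = model2.generate(pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True)
preds = tokenizer.batch_decode(generated_ids[0], skip_special_tokens=True)
preds = [pred.strip() for pred in preds]
print(preds)  # a list with one short caption string, which self_caption now returns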
 
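The final Parallel(...) call composes the classifier and the captioner so both run on the same uploaded image. A rough sketch of that pattern, assuming the legacy Gradio API this Space targets (gr.inputs, gr.outputs, and gradio.mix.Parallel, which later Gradio releases removed); the two stub functions stand in for the real classify_image and self_caption defined in app.py.

# Sketch of combining two Interfaces with gradio.mix.Parallel (legacy Gradio API).
import gradio as gr
from gradio.mix import Parallel

def classify_image(image):
    # stand-in for the Perceiver IO classifier in app.py
    return {"tabby cat": 0.9, "tiger cat": 0.1}

def self_caption(image):
    # stand-in for the ViT-GPT2 captioner in app.py
    return "a cat sitting on a couch"

image = gr.inputs.Image(type="pil")
label = gr.outputs.Label(num_top_classes=5)

img_info1 = gr.Interface(fn=classify_image, inputs=image, outputs=label)
img_info2 = gr.Interface(fn=self_caption, inputs=image,
                         outputs=[gr.outputs.Textbox(label='Caption')])

# Both interfaces receive the same image; their outputs are shown side by side.
Parallel(img_info1, img_info2, inputs=image,
         title="Generate a Story from an Image",
         enable_queue=True).launch(debug=True)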