Spaces:

andsteing
/

lit-demo-bv

Sleeping

App Files Files Community

andsteing committed on Mar 27

Commit

3cfc2e7

•

1 Parent(s): bc8a162

Reformatted code a bit.

Browse files

Files changed (3) hide show

app.py +57 -35
big_vision_contrastive_models.py +32 -17
gradio_helpers.py +7 -3

app.py CHANGED Viewed

@@ -19,6 +19,7 @@ import urllib.request
 import gradio as gr
 import PIL.Image
 import big_vision_contrastive_models as models
 import gradio_helpers
@@ -37,26 +38,26 @@ LOADING_SECS = {'B/16': 5, 'L/16': 10, 'So400m/14': 10}
 MODEL_MAP = {
     'lit': {
         'B/16': {
-          224: 'lit_b16b',
         },
         'L/16': {
-          224: 'lit_l16l',
         },
     },
     'siglip': {
         'B/16': {
-          224: 'siglip_b16b_224',
-          256: 'siglip_b16b_256',
-          384: 'siglip_b16b_384',
-          512: 'siglip_b16b_512',
         },
         'L/16': {
-          256: 'siglip_l16l_256',
-          384: 'siglip_l16l_384',
         },
         'So400m/14': {
-          224: 'siglip_so400m14so440m_224',
-          384: 'siglip_so400m14so440m_384',
         },
     },
 }
@@ -72,7 +73,9 @@ def get_cache_status():
   )
-def compute(image_path, prompts, family, variant, res, bias, progress=gr.Progress()):
   """Loads model and computes answers."""
   if image_path is None:
@@ -83,7 +86,7 @@ def compute(image_path, prompts, family, variant, res, bias, progress=gr.Progres
   model_name = MODEL_MAP[family][variant][res]
   config = models.MODEL_CONFIGS[model_name]
   local_ckpt = gradio_helpers.get_disk_cache(
-    config.ckpt, progress=progress, max_cache_size_bytes=MAX_DISK_CACHE)
   config = dataclasses.replace(config, ckpt=local_ckpt)
   params, model = gradio_helpers.get_memory_cache(
       config,
@@ -91,11 +94,11 @@ def compute(image_path, prompts, family, variant, res, bias, progress=gr.Progres
       max_cache_size_bytes=MAX_RAM_CACHE,
       progress=progress,
       estimated_secs={
-        ('lit', 'B/16'): 1,
-        ('lit', 'L/16'): 2.5,
-        ('siglip', 'B/16'): 9,
-        ('siglip', 'L/16'): 28,
-        ('siglip', 'So400m/14'): 36,
       }.get((family, variant))
   )
   model: models.ContrastiveModel = model
@@ -107,18 +110,19 @@ def compute(image_path, prompts, family, variant, res, bias, progress=gr.Progres
     image = PIL.Image.open(image_path)
     next(it)
   with gradio_helpers.timed('image features'):
-    zimg, out = model.embed_images(
         params, model.preprocess_images([image])
     )
     next(it)
   with gradio_helpers.timed('text features'):
     prompts = prompts.split('\n')
     ztxt, out = model.embed_texts(
-      params, model.preprocess_texts(prompts)
     )
     next(it)
   t = model.get_temperature(out)
   if family == 'lit':
     text_probs = list(model.get_probabilities(zimg, ztxt, t, axis=-1)[0])
   elif family == 'siglip':
@@ -140,7 +144,8 @@ def update_answers(state):
   """Generates visible sliders for answers."""
   answers = []
   for prompt, prob in state[:MAX_ANSWERS]:
-    answers.append(gr.Slider(value=round(100*prob, 2), label=prompt, visible=True))
   while len(answers) < MAX_ANSWERS:
     answers.append(gr.Slider(visible=False))
   return answers
@@ -159,7 +164,10 @@ def create_app():
   with gr.Blocks(css=css) as demo:
-    gr.Markdown('Gradio clone of the original [LiT demo](https://google-research.github.io/vision_transformer/lit/).')
     status = gr.Markdown(f'Ready ({get_cache_status()})')
@@ -168,12 +176,14 @@ def create_app():
       source = gr.Markdown('', visible=False)
       state = gr.State([])
       with gr.Column():
-        prompts = gr.Textbox(label='Prompts (press Shift-ENTER to add a prompt)')
         with gr.Row():
           values = {}
-          family = gr.Dropdown(value='lit', choices=list(MODEL_MAP), label='Model family')
           values['family'] = family.value
           # Unfortunately below reactive UI code is a bit convoluted, because:
@@ -185,25 +195,34 @@ def create_app():
           def make_variant(family_value):
             choices = list(MODEL_MAP[family_value])
             values['variant'] = choices[0]
-            return gr.Dropdown(value=values['variant'], choices=choices, label='Variant')
           variant = make_variant(family.value)
           def make_res(family, variant):
             choices = list(MODEL_MAP[family][variant])
             values['res'] = choices[0]
-            return gr.Dropdown(value=values['res'], choices=choices, label='Resolution')
           res = make_res(family.value, variant.value)
           values['res'] = res.value
           def make_bias(family, variant, res):
             visible = family == 'siglip'
             value = {
-              ('siglip', 'B/16', 224): -12.9,
-              ('siglip', 'L/16', 256): -12.7,
-              ('siglip', 'L/16', 256): -16.5,
-              # ...
             }.get((family, variant, res), -10.0)
-            return gr.Slider(value=value, minimum=-20, maximum=0, step=0.05, label='Bias', visible=visible)
           bias = make_bias(family.value, variant.value, res.value)
           values['bias'] = bias.value
@@ -248,7 +267,10 @@ def create_app():
         # a single `status` widget here, and store the computed information in
         # `state`...
         run.click(
-            fn=compute, inputs=[image, prompts, family, variant, res, bias], outputs=[status, state])
         # ... then we use `state` to update UI components without showing a
         # progress bar in their place.
         status.change(fn=update_answers, inputs=state, outputs=answers)
@@ -258,9 +280,9 @@ def create_app():
     gr.Examples(
         examples=[
             [
-              IMG_URL_FMT.format(ex['id']),
-              ex['prompts'].replace(', ', '\n'),
-              '[source](%s)' % ex['source'],
             ]
             for ex in info
         ],
@@ -272,7 +294,7 @@ def create_app():
   return demo
-if __name__ == "__main__":
   logging.basicConfig(level=logging.INFO,
                       format='%(asctime)s - %(levelname)s - %(message)s')

 import gradio as gr
 import PIL.Image
+# pylint: disable=g-bad-import-order
 import big_vision_contrastive_models as models
 import gradio_helpers
 MODEL_MAP = {
     'lit': {
         'B/16': {
+            224: 'lit_b16b',
         },
         'L/16': {
+            224: 'lit_l16l',
         },
     },
     'siglip': {
         'B/16': {
+            224: 'siglip_b16b_224',
+            256: 'siglip_b16b_256',
+            384: 'siglip_b16b_384',
+            512: 'siglip_b16b_512',
         },
         'L/16': {
+            256: 'siglip_l16l_256',
+            384: 'siglip_l16l_384',
         },
         'So400m/14': {
+            224: 'siglip_so400m14so440m_224',
+            384: 'siglip_so400m14so440m_384',
         },
     },
 }
   )
+def compute(
+    image_path, prompts, family, variant, res, bias, progress=gr.Progress()
+):
   """Loads model and computes answers."""
   if image_path is None:
   model_name = MODEL_MAP[family][variant][res]
   config = models.MODEL_CONFIGS[model_name]
   local_ckpt = gradio_helpers.get_disk_cache(
+      config.ckpt, progress=progress, max_cache_size_bytes=MAX_DISK_CACHE)
   config = dataclasses.replace(config, ckpt=local_ckpt)
   params, model = gradio_helpers.get_memory_cache(
       config,
       max_cache_size_bytes=MAX_RAM_CACHE,
       progress=progress,
       estimated_secs={
+          ('lit', 'B/16'): 1,
+          ('lit', 'L/16'): 2.5,
+          ('siglip', 'B/16'): 9,
+          ('siglip', 'L/16'): 28,
+          ('siglip', 'So400m/14'): 36,
       }.get((family, variant))
   )
   model: models.ContrastiveModel = model
     image = PIL.Image.open(image_path)
     next(it)
   with gradio_helpers.timed('image features'):
+    zimg, unused_out = model.embed_images(
         params, model.preprocess_images([image])
     )
     next(it)
   with gradio_helpers.timed('text features'):
     prompts = prompts.split('\n')
     ztxt, out = model.embed_texts(
+        params, model.preprocess_texts(prompts)
     )
     next(it)
   t = model.get_temperature(out)
+  text_probs = []
   if family == 'lit':
     text_probs = list(model.get_probabilities(zimg, ztxt, t, axis=-1)[0])
   elif family == 'siglip':
   """Generates visible sliders for answers."""
   answers = []
   for prompt, prob in state[:MAX_ANSWERS]:
+    answers.append(
+        gr.Slider(value=round(100*prob, 2), label=prompt, visible=True))
   while len(answers) < MAX_ANSWERS:
     answers.append(gr.Slider(visible=False))
   return answers
   with gr.Blocks(css=css) as demo:
+    gr.Markdown(
+        'Gradio clone of the original '
+        '[LiT demo](https://google-research.github.io/vision_transformer/lit/).'
+    )
     status = gr.Markdown(f'Ready ({get_cache_status()})')
       source = gr.Markdown('', visible=False)
       state = gr.State([])
       with gr.Column():
+        prompts = gr.Textbox(
+            label='Prompts (press Shift-ENTER to add a prompt)')
         with gr.Row():
           values = {}
+          family = gr.Dropdown(
+              value='lit', choices=list(MODEL_MAP), label='Model family')
           values['family'] = family.value
           # Unfortunately below reactive UI code is a bit convoluted, because:
           def make_variant(family_value):
             choices = list(MODEL_MAP[family_value])
             values['variant'] = choices[0]
+            return gr.Dropdown(
+                value=values['variant'], choices=choices, label='Variant')
           variant = make_variant(family.value)
           def make_res(family, variant):
             choices = list(MODEL_MAP[family][variant])
             values['res'] = choices[0]
+            return gr.Dropdown(
+                value=values['res'], choices=choices, label='Resolution')
           res = make_res(family.value, variant.value)
           values['res'] = res.value
           def make_bias(family, variant, res):
             visible = family == 'siglip'
             value = {
+                ('siglip', 'B/16', 224): -12.9,
+                ('siglip', 'L/16', 256): -12.7,
+                ('siglip', 'L/16', 256): -16.5,
+                # ...
             }.get((family, variant, res), -10.0)
+            return gr.Slider(
+                value=value,
+                minimum=-20,
+                maximum=0,
+                step=0.05,
+                label='Bias',
+                visible=visible,
+            )
           bias = make_bias(family.value, variant.value, res.value)
           values['bias'] = bias.value
         # a single `status` widget here, and store the computed information in
         # `state`...
         run.click(
+            fn=compute,
+            inputs=[image, prompts, family, variant, res, bias],
+            outputs=[status, state],
+        )
         # ... then we use `state` to update UI components without showing a
         # progress bar in their place.
         status.change(fn=update_answers, inputs=state, outputs=answers)
     gr.Examples(
         examples=[
             [
+                IMG_URL_FMT.format(ex['id']),
+                ex['prompts'].replace(', ', '\n'),
+                '[source](%s)' % ex['source'],
             ]
             for ex in info
         ],
   return demo
+if __name__ == '__main__':
   logging.basicConfig(level=logging.INFO,
                       format='%(asctime)s - %(levelname)s - %(message)s')

big_vision_contrastive_models.py CHANGED Viewed

@@ -27,15 +27,17 @@ import transformers
 def _clone_git(url, destination_folder, commit_hash=None):
-    subprocess.run([
-       'git', 'clone', '--depth=1',
-       url, destination_folder
-    ], check=True)
-    if commit_hash:
-      subprocess.run(['git', '-C', destination_folder, 'checkout', commit_hash], check=True)
 def setup(commit_hash=None):
   for url, dst_name in (
       ('https://github.com/google-research/big_vision', 'big_vision_repo'),
       ('https://github.com/google/flaxformer', 'flaxformer_repo'),
@@ -43,11 +45,12 @@ def setup(commit_hash=None):
     dst_path = os.path.join(tempfile.gettempdir(), dst_name)
     if not os.path.exists(dst_path):
       _clone_git(url, dst_path, commit_hash)
-    if not dst_path in sys.path:
       sys.path.insert(0, dst_path)
 class ContrastiveModelFamily(enum.Enum):
   LIT = 'lit'
   SIGLIP = 'siglip'
@@ -96,18 +99,21 @@ class ContrastiveModel:
     return ztxt, out
   def preprocess_texts(self, texts):
     def tokenize_pad(text, seqlen=self.config.seqlen):
       if self.config.family == ContrastiveModelFamily.LIT:
-        tokens = self.tokenizer_bert.encode(text, add_special_tokens=True)[:-1]  # removes [SEP]
         tokens = tokens[:seqlen]
         return tokens + [0] * (seqlen - len(tokens))
       if self.config.family == ContrastiveModelFamily.SIGLIP:
         tokens = self.tokenizer_sp.tokenize(text, add_eos=True)
         if len(tokens) >= seqlen:
-          return tokens[:seqlen - 1] + [tok.eos_id()]  # "sticky" eos
         return tokens + [0] * (seqlen - len(tokens))
     return np.array([tokenize_pad(text) for text in texts])
@@ -125,7 +131,9 @@ class ContrastiveModel:
     ]) / 127.5 - 1.0
   def get_bias(self, out):
-    assert self.config.family == ContrastiveModelFamily.SIGLIP, self.config.family
     return out['b'].item()
   def get_temperature(self, out):
@@ -145,7 +153,9 @@ class ContrastiveModel:
       return jax.nn.sigmoid(zimg @ ztxt.T * temperature + bias)
-def _make_config(family, variant, res, textvariant, ckpt, embdim, seqlen, vocab_size):
   if family == 'lit':
     tokenizer = ckpt.replace('.npz', '.txt')
   else:
@@ -153,11 +163,12 @@ def _make_config(family, variant, res, textvariant, ckpt, embdim, seqlen, vocab_
   return ContrastiveModelConfig(
       family=ContrastiveModelFamily(family), variant=variant, res=res,
       textvariant=textvariant, embdim=embdim, seqlen=seqlen,
-      tokenizer=tokenizer, vocab_size=32_000,
       ckpt=ckpt,
   )
 MODEL_CONFIGS = dict(
     lit_b16b=_make_config('lit', 'B/16', 224, 'B', 'gs://vit_models/lit/LiT-B16B.npz', 768, 16, 32_000),
     lit_l16l=_make_config('lit', 'L/16', 224, 'L', 'gs://vit_models/lit/LiT-L16L.npz', 1024, 16, 32_000),
@@ -173,6 +184,7 @@ MODEL_CONFIGS = dict(
     siglip_so400m14so440m_224=_make_config('siglip', 'So400m/14', 224, 'So400m', 'gs://big_vision/siglip/webli_en_so400m_224_57633886.npz', 1152, 16, 32_000),
     siglip_so400m14so400m_384=_make_config('siglip', 'So400m/14', 384, 'So400m', 'gs://big_vision/siglip/webli_en_so400m_384_58765454.npz', 1152, 64, 32_000),
 )
 @functools.cache
@@ -187,7 +199,6 @@ def load_tokenizer_sp(name_or_path):
 @functools.cache
 def load_tokenizer_bert(path):
-  tok = sentencepiece.SentencePieceProcessor()
   if path.startswith('gs://'):
     dst = tempfile.mktemp()
     gfile.copy(path, dst)
@@ -203,7 +214,9 @@ def load_model(config, check_params=False):
   cfg.image_model = 'vit'  # TODO(lbeyer): remove later, default
   if config.family == ContrastiveModelFamily.LIT:
     cfg.text_model = 'proj.flaxformer.bert'
-    cfg.image = dict(variant=config.variant, pool_type='tok',  head_zeroinit=False)
     bert_config = {'B': 'base', 'L': 'large'}[config.textvariant]
     cfg.text = dict(config=bert_config, head_zeroinit=False)
     tokenizer_bert = load_tokenizer_bert(config.tokenizer)
@@ -211,10 +224,12 @@ def load_model(config, check_params=False):
     if config.variant == 'L/16':
       cfg.out_dim = (None, config.embdim)  # (image_out_dim, text_out_dim)
     else:
-      cfg.out_dim = (config.embdim, config.embdim)  # (image_out_dim, text_out_dim)
   else:
     cfg.image = dict(variant=config.variant, pool_type='map')
-    cfg.text_model = 'proj.image_text.text_transformer'  # TODO(lbeyer): remove later, default
     cfg.text = dict(variant=config.textvariant, vocab_size=config.vocab_size)
     cfg.bias_init = -10.0
     tokenizer_sp = load_tokenizer_sp(config.tokenizer)
@@ -223,7 +238,7 @@ def load_model(config, check_params=False):
   cfg.temperature_init = 10.0
   model_mod = importlib.import_module(
-    'big_vision.models.proj.image_text.two_towers')
   model = model_mod.Model(**cfg)
   init_params = None  # Faster but bypasses loading sanity-checks.

 def _clone_git(url, destination_folder, commit_hash=None):
+  subprocess.run(
+      ['git', 'clone', '--depth=1', url, destination_folder], check=True
+  )
+  if commit_hash:
+    subprocess.run(
+        ['git', '-C', destination_folder, 'checkout', commit_hash], check=True
+    )
 def setup(commit_hash=None):
+  """Checks out required non-pypi code from Github."""
   for url, dst_name in (
       ('https://github.com/google-research/big_vision', 'big_vision_repo'),
       ('https://github.com/google/flaxformer', 'flaxformer_repo'),
     dst_path = os.path.join(tempfile.gettempdir(), dst_name)
     if not os.path.exists(dst_path):
       _clone_git(url, dst_path, commit_hash)
+    if dst_path not in sys.path:
       sys.path.insert(0, dst_path)
 class ContrastiveModelFamily(enum.Enum):
+  """Defines a contrastive model family."""
   LIT = 'lit'
   SIGLIP = 'siglip'
     return ztxt, out
   def preprocess_texts(self, texts):
+    """Converts texts to padded tokens."""
     def tokenize_pad(text, seqlen=self.config.seqlen):
       if self.config.family == ContrastiveModelFamily.LIT:
+        tokens = self.tokenizer_bert.encode(text, add_special_tokens=True)
+        tokens = tokens[:-1]  # removes [SEP]
         tokens = tokens[:seqlen]
         return tokens + [0] * (seqlen - len(tokens))
       if self.config.family == ContrastiveModelFamily.SIGLIP:
         tokens = self.tokenizer_sp.tokenize(text, add_eos=True)
         if len(tokens) >= seqlen:
+          eos_id = self.tokenizer_sp.eos_id()
+          return tokens[:seqlen - 1] + [eos_id]  # "sticky" eos
         return tokens + [0] * (seqlen - len(tokens))
     return np.array([tokenize_pad(text) for text in texts])
     ]) / 127.5 - 1.0
   def get_bias(self, out):
+    assert (
+        self.config.family == ContrastiveModelFamily.SIGLIP
+    ), self.config.family
     return out['b'].item()
   def get_temperature(self, out):
       return jax.nn.sigmoid(zimg @ ztxt.T * temperature + bias)
+def _make_config(
+    family, variant, res, textvariant, ckpt, embdim, seqlen, vocab_size
+):
   if family == 'lit':
     tokenizer = ckpt.replace('.npz', '.txt')
   else:
   return ContrastiveModelConfig(
       family=ContrastiveModelFamily(family), variant=variant, res=res,
       textvariant=textvariant, embdim=embdim, seqlen=seqlen,
+      tokenizer=tokenizer, vocab_size=vocab_size,
       ckpt=ckpt,
   )
+# pylint: disable=line-too-long
 MODEL_CONFIGS = dict(
     lit_b16b=_make_config('lit', 'B/16', 224, 'B', 'gs://vit_models/lit/LiT-B16B.npz', 768, 16, 32_000),
     lit_l16l=_make_config('lit', 'L/16', 224, 'L', 'gs://vit_models/lit/LiT-L16L.npz', 1024, 16, 32_000),
     siglip_so400m14so440m_224=_make_config('siglip', 'So400m/14', 224, 'So400m', 'gs://big_vision/siglip/webli_en_so400m_224_57633886.npz', 1152, 16, 32_000),
     siglip_so400m14so400m_384=_make_config('siglip', 'So400m/14', 384, 'So400m', 'gs://big_vision/siglip/webli_en_so400m_384_58765454.npz', 1152, 64, 32_000),
 )
+# pylint: enable=line-too-long
 @functools.cache
 @functools.cache
 def load_tokenizer_bert(path):
   if path.startswith('gs://'):
     dst = tempfile.mktemp()
     gfile.copy(path, dst)
   cfg.image_model = 'vit'  # TODO(lbeyer): remove later, default
   if config.family == ContrastiveModelFamily.LIT:
     cfg.text_model = 'proj.flaxformer.bert'
+    cfg.image = dict(
+        variant=config.variant, pool_type='tok', head_zeroinit=False
+    )
     bert_config = {'B': 'base', 'L': 'large'}[config.textvariant]
     cfg.text = dict(config=bert_config, head_zeroinit=False)
     tokenizer_bert = load_tokenizer_bert(config.tokenizer)
     if config.variant == 'L/16':
       cfg.out_dim = (None, config.embdim)  # (image_out_dim, text_out_dim)
     else:
+      # (image_out_dim, text_out_dim)
+      cfg.out_dim = (config.embdim, config.embdim)
   else:
     cfg.image = dict(variant=config.variant, pool_type='map')
+    # TODO(lbeyer): remove later, default
+    cfg.text_model = 'proj.image_text.text_transformer'
     cfg.text = dict(variant=config.textvariant, vocab_size=config.vocab_size)
     cfg.bias_init = -10.0
     tokenizer_sp = load_tokenizer_sp(config.tokenizer)
   cfg.temperature_init = 10.0
   model_mod = importlib.import_module(
+      'big_vision.models.proj.image_text.two_towers')
   model = model_mod.Model(**cfg)
   init_params = None  # Faster but bypasses loading sanity-checks.

gradio_helpers.py CHANGED Viewed

@@ -30,8 +30,9 @@ def timed(name):
     logging.info('Timed %s: %.1f secs', name, timing['secs'])
-def copy_file(src, dst, *, progress=None, block_size=1024 * 1024 * 10, overwrite=False):
   """Copies a file with progress bar.
   Args:
@@ -39,6 +40,7 @@ def copy_file(src, dst, *, progress=None, block_size=1024 * 1024 * 10, overwrite
     dst: Destination file. Path must be readable by `tf.io.gfile`.
     progress: An object with a `.tqdm` attribute, or `None`.
     block_size: Size of individual blocks to be read/written.
   """
   if os.path.dirname(dst):
     os.makedirs(os.path.dirname(dst), exist_ok=True)
@@ -87,7 +89,9 @@ def _get_array_sizes(tree):
   return [getattr(x, 'nbytes', 0) for x in jax.tree_leaves(tree)]
-def get_memory_cache(key, getter, max_cache_size_bytes, progress=None, estimated_secs=None):
   """Keeps cache below specified size by removing elements not last accessed."""
   if key in _memory_cache:
     _memory_cache[key] = _memory_cache.pop(key)  # updated "last accessed" order

     logging.info('Timed %s: %.1f secs', name, timing['secs'])
+def copy_file(
+    src, dst, *, progress=None, block_size=1024 * 1024 * 10, overwrite=False
+):
   """Copies a file with progress bar.
   Args:
     dst: Destination file. Path must be readable by `tf.io.gfile`.
     progress: An object with a `.tqdm` attribute, or `None`.
     block_size: Size of individual blocks to be read/written.
+    overwrite: If `True`, overwrite `dst` if it exists.
   """
   if os.path.dirname(dst):
     os.makedirs(os.path.dirname(dst), exist_ok=True)
   return [getattr(x, 'nbytes', 0) for x in jax.tree_leaves(tree)]
+def get_memory_cache(
+    key, getter, max_cache_size_bytes, progress=None, estimated_secs=None
+):
   """Keeps cache below specified size by removing elements not last accessed."""
   if key in _memory_cache:
     _memory_cache[key] = _memory_cache.pop(key)  # updated "last accessed" order