Spaces:

nvidia
/

radio

Running

gheinrich committed on Jul 28

Commit

703aea5

•

1 Parent(s): 2420cd1

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -275,7 +275,7 @@ def infer_radio(image):
     masks = sam_generator.generate(image_array)
     overlay = visualize_anns(image_array, masks)
-    return f"{features.shape}", pca_viz, overlay
@@ -284,24 +284,26 @@ title = """RADIO: Reduce All Domains Into One"""
 description = """
 # RADIO
-AM-RADIO is a framework to distill Large Vision Foundation models into a single one.
 RADIO, a new vision foundation model, excels across visual domains, serving as a superior replacement for vision backbones.
 Integrating CLIP variants, DINOv2, and SAM through distillation, it preserves unique features like text grounding and segmentation correspondence.
 Outperforming teachers in ImageNet zero-shot (+6.8%), kNN (+2.39%), and linear probing segmentation (+3.8%) and vision-language models (LLaVa 1.5 up to 1.5%), it scales to any resolution, supports non-square images.
 # Instructions
-Simply paste an image or pick one from the gallery of examples and then click the "Submit" button.
 """
 inputs = [
     gr.Image(type="pil")
 ]
-outputs = [
-    gr.Textbox(label="Feature Shape"),
     gr.Image(label="PCA Feature Visalization"),
     gr.Image(label="SAM Masks"),
 ]
 # Create the Gradio interface

     masks = sam_generator.generate(image_array)
     overlay = visualize_anns(image_array, masks)
+    return pca_viz, overlay, f"{features.shape}"
 description = """
 # RADIO
+[AM-RADIO](https://github.com/NVlabs/RADIO) is a framework to distill Large Vision Foundation models into a single one.
 RADIO, a new vision foundation model, excels across visual domains, serving as a superior replacement for vision backbones.
 Integrating CLIP variants, DINOv2, and SAM through distillation, it preserves unique features like text grounding and segmentation correspondence.
 Outperforming teachers in ImageNet zero-shot (+6.8%), kNN (+2.39%), and linear probing segmentation (+3.8%) and vision-language models (LLaVa 1.5 up to 1.5%), it scales to any resolution, supports non-square images.
 # Instructions
+Paste an image into the input box or pick one from the gallery of examples and then click the "Submit" button.
+The RADIO backbone features are processed with a PCA projection to 3 channels and displayed as RGB channels.
+The SAM features are processed using the SAM decoder and shown as an overlay on top of the input image.
 """
 inputs = [
     gr.Image(type="pil")
 ]
+outputs = [
     gr.Image(label="PCA Feature Visalization"),
     gr.Image(label="SAM Masks"),
+    gr.Textbox(label="Feature Shape"),
 ]
 # Create the Gradio interface