prithivMLmods
/

DeepCaption-VLA-7B

+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ekEpkSW7ocND"
+      },
+      "source": [
+        "## **Multimodal Caption: DeepCaption-VLA-7B (4-bit)**"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "uFovmijgUV1Z"
+      },
+      "source": [
+        "\n",
+        "\n",
+        "The DeepCaption-VLA-7B model is a fine-tuned version of Qwen2.5-VL-7B-Instruct, tailored for Image Captioning and Vision Language Attribution. This variant is designed to generate precise, highly descriptive captions with a focus on defining visual properties, object attributes, and scene details across a wide spectrum of images and aspect ratios.\n",
+        "\n",
+        "\n",
+        "High-Fidelity Descriptions: Handles general, artistic, technical, abstract, and low-context images with descriptive depth.\n",
+        "\n",
+        "| IMG 1 | IMG 2 |\n",
+        "|-------|-------|\n",
+        "| <img src=\"https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/OOGTz33hb_0oLgx0M-hb-.png\" width=\"300\"/> | <img src=\"https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/JWHWEpElNt0XLL_F5iSHo.png\" width=\"300\"/> |\n",
+        "\n",
+        "\n",
+        "*Notebook by: [prithivMLmods](https://huggingface.co/prithivMLmods)*"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "RugX4SGZV-8O"
+      },
+      "source": [
+        "### **Installing all necessary packages**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "l-NtFtjSpuJQ"
+      },
+      "outputs": [],
+      "source": [
+        "%%capture\n",
+        "# Heads-up: the installs below take around 2-3 minutes.\n",
+        "!pip install git+https://github.com/huggingface/transformers.git \\\n",
+        "             git+https://github.com/huggingface/accelerate.git \\\n",
+        "             git+https://github.com/huggingface/peft.git \\\n",
+        "             transformers-stream-generator huggingface_hub albumentations \\\n",
+        "             pyvips-binary qwen-vl-utils sentencepiece opencv-python docling-core \\\n",
+        "             python-docx torchvision safetensors matplotlib num2words\n",
+        "\n",
+        "!pip install xformers requests pymupdf hf_xet spaces pyvips pillow gradio \\\n",
+        "             einops torch fpdf timm av decord bitsandbytes reportlab"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "mvoSnRZcVBu4"
+      },
+      "source": [
+        "### **Run the DeepCaption-VLA-7B (4-bit) Demo**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "tElKr2Fkp1bO"
+      },
+      "outputs": [],
+      "source": [
+        "import gradio as gr\n",
+        "import spaces\n",
+        "from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer, BitsAndBytesConfig\n",
+        "from qwen_vl_utils import process_vision_info\n",
+        "import torch\n",
+        "from PIL import Image\n",
+        "import os\n",
+        "import uuid\n",
+        "import io\n",
+        "from threading import Thread\n",
+        "from reportlab.lib.pagesizes import A4\n",
+        "from reportlab.lib.styles import getSampleStyleSheet\n",
+        "from reportlab.lib import colors\n",
+        "from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer\n",
+        "from reportlab.lib.units import inch\n",
+        "from reportlab.pdfbase import pdfmetrics\n",
+        "from reportlab.pdfbase.ttfonts import TTFont\n",
+        "import docx\n",
+        "from docx.enum.text import WD_ALIGN_PARAGRAPH\n",
+        "\n",
+        "# Display name -> Hugging Face repo id for every selectable checkpoint.\n",
+        "MODEL_OPTIONS = {\"DeepCaption-VLA-7B\": \"prithivMLmods/DeepCaption-VLA-7B\"}\n",
+        "\n",
+        "# 4-bit loading config (NF4, double quantization, fp16 compute) to save VRAM.\n",
+        "# Adjust these settings if you need a different memory/quality trade-off.\n",
+        "quantization_config = BitsAndBytesConfig(\n",
+        "    load_in_4bit=True,\n",
+        "    bnb_4bit_quant_type=\"nf4\",\n",
+        "    bnb_4bit_use_double_quant=True,\n",
+        "    bnb_4bit_compute_dtype=torch.float16,\n",
+        ")\n",
+        "\n",
+        "# Load every model/processor pair up front so the UI callbacks only do lookups.\n",
+        "models, processors = {}, {}\n",
+        "for label, repo_id in MODEL_OPTIONS.items():\n",
+        "    print(f\"Loading {label}🤗. This will use 4-bit quantization to save VRAM.\")\n",
+        "    load_kwargs = dict(\n",
+        "        trust_remote_code=True,\n",
+        "        quantization_config=quantization_config,\n",
+        "        device_map=\"auto\",\n",
+        "    )\n",
+        "    models[label] = Qwen2_5_VLForConditionalGeneration.from_pretrained(repo_id, **load_kwargs)\n",
+        "    processors[label] = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)\n",
+        "\n",
+        "# Extensions Pillow has registered; the keys are used to recognise image paths.\n",
+        "image_extensions = Image.registered_extensions()\n",
+        "\n",
+        "def identify_and_save_blob(blob_path):\n",
+        "    \"\"\"Checks that the file at ``blob_path`` is an image and saves a copy of it.\n",
+        "\n",
+        "    The bytes are validated with Pillow and, if they form a valid image,\n",
+        "    written to a new ``temp_<uuid>_media.<ext>`` file in the working directory.\n",
+        "\n",
+        "    Args:\n",
+        "        blob_path: Path of the uploaded file to inspect.\n",
+        "\n",
+        "    Returns:\n",
+        "        A ``(filename, media_type)`` tuple; ``media_type`` is always ``\"image\"``.\n",
+        "\n",
+        "    Raises:\n",
+        "        ValueError: If the file is missing, is not a valid image, or any other\n",
+        "            error occurs while reading or writing it.\n",
+        "    \"\"\"\n",
+        "    try:\n",
+        "        # Read everything first so the input handle is closed before writing.\n",
+        "        with open(blob_path, \"rb\") as file:\n",
+        "            blob_content = file.read()\n",
+        "\n",
+        "        try:\n",
+        "            probe = Image.open(io.BytesIO(blob_content))\n",
+        "            # Grab the format before verify(); the image object must not be\n",
+        "            # used for anything else once verify() has run.\n",
+        "            detected_format = probe.format\n",
+        "            probe.verify()  # Check if it's a valid image\n",
+        "        except (IOError, SyntaxError) as exc:\n",
+        "            raise ValueError(\"Unsupported media type. Please upload a valid image.\") from exc\n",
+        "\n",
+        "        # Name the copy after the real format; fall back to PNG if unknown.\n",
+        "        extension = f\".{detected_format.lower()}\" if detected_format else \".png\"\n",
+        "        media_type = \"image\"\n",
+        "\n",
+        "        filename = f\"temp_{uuid.uuid4()}_media{extension}\"\n",
+        "        with open(filename, \"wb\") as f:\n",
+        "            f.write(blob_content)\n",
+        "\n",
+        "        return filename, media_type\n",
+        "\n",
+        "    except ValueError:\n",
+        "        # Already a user-facing message; do not wrap it a second time.\n",
+        "        raise\n",
+        "    except FileNotFoundError as exc:\n",
+        "        raise ValueError(f\"The file {blob_path} was not found.\") from exc\n",
+        "    except Exception as exc:\n",
+        "        raise ValueError(f\"An error occurred while processing the file: {exc}\") from exc\n",
+        "\n",
+        "@spaces.GPU\n",
+        "def qwen_inference(model_name, media_input, text_input=None):\n",
+        "    \"\"\"Streams the selected model's answer for one image plus a text prompt.\n",
+        "\n",
+        "    Args:\n",
+        "        model_name: Key into the preloaded ``models`` / ``processors`` dicts.\n",
+        "        media_input: Filesystem path of the uploaded image.\n",
+        "        text_input: Prompt sent to the model together with the image.\n",
+        "\n",
+        "    Yields:\n",
+        "        The text generated so far (cumulative), once per streamed chunk.\n",
+        "\n",
+        "    Raises:\n",
+        "        ValueError: If no file path was given or the file is not a valid image.\n",
+        "    \"\"\"\n",
+        "    model = models[model_name]\n",
+        "    processor = processors[model_name]\n",
+        "\n",
+        "    # Without this guard a missing upload (None) fell through to the message\n",
+        "    # construction below and failed on unbound media_type / media_path.\n",
+        "    if not isinstance(media_input, str) or not media_input:\n",
+        "        raise ValueError(\"No image provided. Please upload a valid image.\")\n",
+        "\n",
+        "    media_path = media_input\n",
+        "    # Compare in lower case so paths such as photo.PNG are recognised directly.\n",
+        "    if media_path.lower().endswith(tuple(image_extensions)):\n",
+        "        media_type = \"image\"\n",
+        "    else:\n",
+        "        # Unknown extension: validate the content and work on the saved copy.\n",
+        "        try:\n",
+        "            media_path, media_type = identify_and_save_blob(media_input)\n",
+        "        except Exception as e:\n",
+        "            raise ValueError(\"Unsupported media type. Please upload a valid image.\") from e\n",
+        "\n",
+        "    messages = [\n",
+        "        {\n",
+        "            \"role\": \"user\",\n",
+        "            \"content\": [\n",
+        "                {\n",
+        "                    \"type\": media_type,\n",
+        "                    media_type: media_path\n",
+        "                },\n",
+        "                {\"type\": \"text\", \"text\": text_input},\n",
+        "            ],\n",
+        "        }\n",
+        "    ]\n",
+        "\n",
+        "    text = processor.apply_chat_template(\n",
+        "        messages, tokenize=False, add_generation_prompt=True\n",
+        "    )\n",
+        "    image_inputs, _ = process_vision_info(messages)\n",
+        "    inputs = processor(\n",
+        "        text=[text],\n",
+        "        images=image_inputs,\n",
+        "        padding=True,\n",
+        "        return_tensors=\"pt\",\n",
+        "    ).to(\"cuda\")\n",
+        "\n",
+        "    streamer = TextIteratorStreamer(\n",
+        "        processor.tokenizer, skip_prompt=True, skip_special_tokens=True\n",
+        "    )\n",
+        "    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)\n",
+        "\n",
+        "    # generate() runs in a worker thread so this generator can relay the\n",
+        "    # streamer's chunks to the UI while decoding is still in progress.\n",
+        "    thread = Thread(target=model.generate, kwargs=generation_kwargs)\n",
+        "    thread.start()\n",
+        "\n",
+        "    buffer = \"\"\n",
+        "    for new_text in streamer:\n",
+        "        buffer += new_text\n",
+        "        # Remove <|im_end|> or similar tokens from the output\n",
+        "        buffer = buffer.replace(\"<|im_end|>\", \"\")\n",
+        "        yield buffer\n",
+        "\n",
+        "    # The streamer is exhausted; wait for the worker so it is not left dangling.\n",
+        "    thread.join()\n",
+        "\n",
+        "def format_plain_text(output_text):\n",
+        "    \"\"\"Returns ``output_text`` with the LaTeX math delimiters stripped out.\"\"\"\n",
+        "    cleaned = output_text\n",
+        "    # Drop the inline pair first, then the display pair.\n",
+        "    for delimiter in (\"\\\\(\", \"\\\\)\", \"\\\\[\", \"\\\\]\"):\n",
+        "        cleaned = cleaned.replace(delimiter, \"\")\n",
+        "    return cleaned\n",
+        "\n",
+        "def generate_document(media_path, output_text, file_format, font_size, line_spacing, alignment, image_size):\n",
+        "    \"\"\"Builds a PDF or DOCX holding the input image and the plain-text output.\n",
+        "\n",
+        "    Returns the generated file's name, or ``None`` when ``file_format`` is\n",
+        "    neither ``\"pdf\"`` nor ``\"docx\"``.\n",
+        "    \"\"\"\n",
+        "    plain_text = format_plain_text(output_text)\n",
+        "    # Pick the writer for the requested format.\n",
+        "    writers = {\"pdf\": generate_pdf, \"docx\": generate_docx}\n",
+        "    writer = writers.get(file_format)\n",
+        "    if writer is None:\n",
+        "        return None\n",
+        "    return writer(media_path, plain_text, font_size, line_spacing, alignment, image_size)\n",
+        "\n",
+        "def generate_pdf(media_path, plain_text, font_size, line_spacing, alignment, image_size):\n",
+        "    \"\"\"Generates an A4 PDF containing the input image followed by the text.\n",
+        "\n",
+        "    Args:\n",
+        "        media_path: Path of the image to embed.\n",
+        "        plain_text: Text placed below the image.\n",
+        "        font_size: Font size in points (anything ``int()`` accepts).\n",
+        "        line_spacing: Multiplier applied to the font size to get the leading.\n",
+        "        alignment: One of \"Left\", \"Center\", \"Right\" or \"Justified\".\n",
+        "        image_size: One of \"Small\", \"Medium\" or \"Large\".\n",
+        "\n",
+        "    Returns:\n",
+        "        Name of the generated ``output_<uuid>.pdf`` file.\n",
+        "    \"\"\"\n",
+        "    from xml.sax.saxutils import escape  # stdlib; only needed in this function\n",
+        "\n",
+        "    filename = f\"output_{uuid.uuid4()}.pdf\"\n",
+        "    doc = SimpleDocTemplate(\n",
+        "        filename,\n",
+        "        pagesize=A4,\n",
+        "        rightMargin=inch,\n",
+        "        leftMargin=inch,\n",
+        "        topMargin=inch,\n",
+        "        bottomMargin=inch\n",
+        "    )\n",
+        "    styles = getSampleStyleSheet()\n",
+        "    body_style = styles[\"Normal\"]\n",
+        "    body_style.fontSize = int(font_size)\n",
+        "    body_style.leading = int(font_size) * line_spacing\n",
+        "    body_style.alignment = {\n",
+        "        \"Left\": 0,\n",
+        "        \"Center\": 1,\n",
+        "        \"Right\": 2,\n",
+        "        \"Justified\": 4\n",
+        "    }[alignment]\n",
+        "\n",
+        "    story = []\n",
+        "\n",
+        "    # Bounding box (in points) for each image size preset.\n",
+        "    image_sizes = {\n",
+        "        \"Small\": (200, 200),\n",
+        "        \"Medium\": (400, 400),\n",
+        "        \"Large\": (600, 600)\n",
+        "    }\n",
+        "    box_width, box_height = image_sizes[image_size]\n",
+        "\n",
+        "    # Keep the picture inside the printable area: a fixed 600pt \"Large\" image\n",
+        "    # is wider than an A4 page. The 12pt allowance covers the frame's default\n",
+        "    # 6pt padding on each side.\n",
+        "    max_width = min(box_width, doc.width - 12)\n",
+        "    max_height = min(box_height, doc.height - 12)\n",
+        "\n",
+        "    # Scale uniformly so the image keeps its aspect ratio instead of being\n",
+        "    # stretched into a square (the DOCX export likewise only fixes the width).\n",
+        "    with Image.open(media_path) as source_image:\n",
+        "        source_width, source_height = source_image.size\n",
+        "    scale = min(max_width / source_width, max_height / source_height)\n",
+        "    img = RLImage(media_path, width=source_width * scale, height=source_height * scale)\n",
+        "    story.append(img)\n",
+        "    story.append(Spacer(1, 12))\n",
+        "\n",
+        "    # Paragraph interprets its text as XML-like markup, so escape &, < and >\n",
+        "    # coming from the model and turn newlines into explicit line breaks.\n",
+        "    markup = escape(plain_text).replace(\"\\n\", \"<br/>\")\n",
+        "    story.append(Paragraph(markup, body_style))\n",
+        "\n",
+        "    doc.build(story)\n",
+        "    return filename\n",
+        "\n",
+        "def generate_docx(media_path, plain_text, font_size, line_spacing, alignment, image_size):\n",
+        "    \"\"\"Builds a DOCX file holding the input image followed by the text.\n",
+        "\n",
+        "    Returns the name of the saved ``output_<uuid>.docx`` file.\n",
+        "    \"\"\"\n",
+        "    filename = f\"output_{uuid.uuid4()}.docx\"\n",
+        "    document = docx.Document()\n",
+        "\n",
+        "    # Picture first (only its width is set), then an empty spacer paragraph.\n",
+        "    width_by_size = {\n",
+        "        \"Small\": docx.shared.Inches(2),\n",
+        "        \"Medium\": docx.shared.Inches(4),\n",
+        "        \"Large\": docx.shared.Inches(6)\n",
+        "    }\n",
+        "    document.add_picture(media_path, width=width_by_size[image_size])\n",
+        "    document.add_paragraph()\n",
+        "\n",
+        "    # Text paragraph with the requested spacing, alignment and font size.\n",
+        "    alignment_by_name = {\n",
+        "        \"Left\": WD_ALIGN_PARAGRAPH.LEFT,\n",
+        "        \"Center\": WD_ALIGN_PARAGRAPH.CENTER,\n",
+        "        \"Right\": WD_ALIGN_PARAGRAPH.RIGHT,\n",
+        "        \"Justified\": WD_ALIGN_PARAGRAPH.JUSTIFY\n",
+        "    }\n",
+        "    body = document.add_paragraph()\n",
+        "    body_format = body.paragraph_format\n",
+        "    body_format.line_spacing = line_spacing\n",
+        "    body_format.alignment = alignment_by_name[alignment]\n",
+        "    body.add_run(plain_text).font.size = docx.shared.Pt(int(font_size))\n",
+        "\n",
+        "    document.save(filename)\n",
+        "    return filename\n",
+        "\n",
+        "# Custom CSS passed to gr.Blocks(css=...): styles the #output element and\n",
+        "# colours the buttons tagged with the submit-btn / download-btn classes.\n",
+        "# NOTE(review): no component visible in this cell sets elem_id=\"output\", so the\n",
+        "# #output rule may be unused - confirm.\n",
+        "css = \"\"\"\n",
+        "  #output {\n",
+        "    height: 500px;\n",
+        "    overflow: auto;\n",
+        "    border: 1px solid #ccc;\n",
+        "  }\n",
+        ".submit-btn {\n",
+        "    background-color: #cf3434  !important;\n",
+        "    color: white !important;\n",
+        "}\n",
+        ".submit-btn:hover {\n",
+        "    background-color: #ff2323 !important;\n",
+        "}\n",
+        ".download-btn {\n",
+        "    background-color: #35a6d6 !important;\n",
+        "    color: white !important;\n",
+        "}\n",
+        ".download-btn:hover {\n",
+        "    background-color: #22bcff !important;\n",
+        "}\n",
+        "\"\"\"\n",
+        "\n",
+        "# Build the Gradio interface and wire up its callbacks\n",
+        "with gr.Blocks(css=css, theme=\"bethecloud/storj_theme\") as demo:\n",
+        "    gr.Markdown(\"# **Multimodal-Caption : DeepCaption-VLA-7B**\")\n",
+        "\n",
+        "    with gr.Tab(label=\"Image Input\"):\n",
+        "\n",
+        "        # Left column: inputs. Right column: streamed output and its plain-text copy.\n",
+        "        with gr.Row():\n",
+        "            with gr.Column():\n",
+        "                model_choice = gr.Dropdown(\n",
+        "                    choices=list(MODEL_OPTIONS.keys()),\n",
+        "                    value=\"DeepCaption-VLA-7B\",\n",
+        "                    label=\"Model Selection\",\n",
+        "                )\n",
+        "                input_media = gr.File(label=\"Upload Image\", type=\"filepath\")\n",
+        "                text_input = gr.Textbox(value=\"Caption the image precisely.\", label=\"Question\")\n",
+        "                submit_btn = gr.Button(value=\"Submit\", elem_classes=\"submit-btn\")\n",
+        "\n",
+        "            with gr.Column():\n",
+        "                output_text = gr.Textbox(label=\"Output Text\", lines=7)\n",
+        "\n",
+        "                with gr.Accordion(\"Plain Text\", open=False):\n",
+        "                    plain_text_output = gr.Textbox(label=\"Standardized Plain Text\", lines=10)\n",
+        "\n",
+        "        # Stream the model output first, then mirror it into the plain-text box.\n",
+        "        inference_event = submit_btn.click(\n",
+        "            fn=qwen_inference,\n",
+        "            inputs=[model_choice, input_media, text_input],\n",
+        "            outputs=[output_text],\n",
+        "        )\n",
+        "        inference_event.then(\n",
+        "            fn=lambda output_text: format_plain_text(output_text),\n",
+        "            inputs=[output_text],\n",
+        "            outputs=[plain_text_output],\n",
+        "        )\n",
+        "\n",
+        "        # Formatting options applied when exporting the result as a document.\n",
+        "        with gr.Accordion(\"Docx/PDF Settings\", open=False):\n",
+        "            with gr.Row():\n",
+        "                with gr.Column():\n",
+        "                    line_spacing = gr.Dropdown(\n",
+        "                        label=\"Line Spacing\",\n",
+        "                        choices=[0.5, 1.0, 1.15, 1.5, 2.0, 2.5, 3.0],\n",
+        "                        value=1.5,\n",
+        "                    )\n",
+        "                    font_size = gr.Dropdown(\n",
+        "                        label=\"Font Size\",\n",
+        "                        choices=[\"8\", \"10\", \"12\", \"14\", \"16\", \"18\", \"20\", \"22\", \"24\"],\n",
+        "                        value=\"16\",\n",
+        "                    )\n",
+        "                    alignment = gr.Dropdown(\n",
+        "                        label=\"Text Alignment\",\n",
+        "                        choices=[\"Left\", \"Center\", \"Right\", \"Justified\"],\n",
+        "                        value=\"Justified\",\n",
+        "                    )\n",
+        "                    image_size = gr.Dropdown(\n",
+        "                        label=\"Image Size\",\n",
+        "                        choices=[\"Small\", \"Medium\", \"Large\"],\n",
+        "                        value=\"Medium\",\n",
+        "                    )\n",
+        "                    file_format = gr.Radio(choices=[\"pdf\", \"docx\"], value=\"pdf\", label=\"File Format\")\n",
+        "\n",
+        "        get_document_btn = gr.Button(value=\"Get Document\", elem_classes=\"download-btn\")\n",
+        "        download_file = gr.File(label=\"Download Document\")\n",
+        "\n",
+        "        get_document_btn.click(\n",
+        "            fn=generate_document,\n",
+        "            inputs=[input_media, output_text, file_format, font_size, line_spacing, alignment, image_size],\n",
+        "            outputs=download_file,\n",
+        "        )\n",
+        "\n",
+        "demo.launch(debug=True)"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "gpuType": "T4",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}