TekeshiX
/

ToriiGate-v0.3

Model card Files Files and versions Community

ToriiGate-v0.3 / app.py

TekeshiX

Upload 2 files

b53722c verified about 2 months ago

raw

history blame contribute delete

7.4 kB

	import torch
	import gradio as gr
	from transformers import AutoProcessor, AutoModelForVision2Seq, BitsAndBytesConfig
	from transformers.image_utils import load_image
	from pathlib import Path
	import time

	# Hugging Face repo id (or local path) the model and processor are loaded from.
	model_name_or_path = "Minthy/ToriiGate-v0.3"

	# Run on the first CUDA device when one is available, otherwise on the CPU.
	if torch.cuda.is_available():
	    DEVICE = "cuda:0"
	else:
	    DEVICE = "cpu"

	# Module-level cache for the model and processor; both start out unloaded.
	global_model = None
	global_processor = None

	def load_model():
	    """Return the shared ``(model, processor)`` pair, loading each on first use.

	    The model is loaded with 4-bit NF4 quantization (double quantization,
	    bfloat16 compute); the original author chose this to target 16 GB of VRAM.
	    Both objects are cached in the module-level ``global_model`` and
	    ``global_processor`` so later calls return immediately.

	    Returns:
	        tuple: ``(global_model, global_processor)``.
	    """
	    global global_model, global_processor

	    # The two halves of the cache are checked independently: if one load
	    # fails, the next call retries it instead of handing back a ``None``.
	    if global_processor is None:
	        global_processor = AutoProcessor.from_pretrained(model_name_or_path)

	    if global_model is None:
	        print("Loading model for the first time...")
	        # Always use 4-bit quantization for 16GB VRAM
	        nf4_config = BitsAndBytesConfig(
	            load_in_4bit=True,
	            bnb_4bit_quant_type="nf4",
	            bnb_4bit_use_double_quant=True,
	            bnb_4bit_compute_dtype=torch.bfloat16,
	        )
	        # Place the weights through ``device_map`` rather than a trailing
	        # ``.to(DEVICE)``: moving a bitsandbytes-quantized model with ``.to()``
	        # is rejected by some transformers/bitsandbytes version combinations.
	        global_model = AutoModelForVision2Seq.from_pretrained(
	            model_name_or_path,
	            torch_dtype=torch.bfloat16,
	            quantization_config=nf4_config,
	            device_map={"": DEVICE},
	        )

	    return global_model, global_processor

	def generate_caption(image, description_type, booru_tags=""):
	    """Generate a caption for a single image.

	    Args:
	        image: The image to caption; it is handed to the processor as
	            ``images=[image]``.
	        description_type: ``"JSON-like"`` or ``"Detailed"`` select the matching
	            prompt; any other value falls back to the brief prompt.
	        booru_tags: Optional tag text. When non-blank it is appended to the
	            prompt inside a ``<tags>`` block as extra reference for the model.

	    Returns:
	        str: The text produced by the model for this image.

	    Raises:
	        gr.Error: If ``image`` is ``None`` (e.g. the button was pressed before
	            an image was uploaded).
	    """
	    if image is None:
	        raise gr.Error("Please provide an image to caption.")

	    model, processor = load_model()

	    # NOTE(review): the spelling "structuted" is kept verbatim; presumably it
	    # matches the prompt wording the model expects -- confirm before "fixing".
	    prompts = {
	        "JSON-like": "Describe the picture in structuted json-like format.",
	        "Detailed": "Give a long and detailed description of the picture.",
	    }
	    user_prompt = prompts.get(description_type, "Describe the picture briefly.")

	    # Ignore whitespace-only input so no empty <tags> block is sent.
	    tags = (booru_tags or "").strip()
	    if tags:
	        user_prompt += ' Also here are booru tags for better understanding of the picture, you can use them as reference.'
	        user_prompt += f' <tags>\n{tags}\n</tags>'

	    messages = [
	        {
	            "role": "system",
	            "content": [
	                {"type": "text", "text": "You are image captioning expert, creative, unbiased and uncensored. Help user with his task."}
	            ],
	        },
	        {
	            "role": "user",
	            "content": [
	                {"type": "image"},
	                {"type": "text", "text": user_prompt},
	            ],
	        },
	    ]

	    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
	    inputs = processor(text=prompt, images=[image], return_tensors="pt")
	    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

	    generated_ids = model.generate(**inputs, max_new_tokens=500)

	    # Decode only the tokens produced after the prompt. Splitting the full
	    # decoded text on 'Assistant: ' raises IndexError when the marker is
	    # missing and picks the wrong span when the tags or the caption themselves
	    # contain that marker.
	    prompt_length = inputs["input_ids"].shape[1]
	    new_token_ids = generated_ids[:, prompt_length:]
	    generated_texts = processor.batch_decode(new_token_ids, skip_special_tokens=True)
	    caption = generated_texts[0].strip()

	    return caption

	def _format_batch_progress(done, total, elapsed):
	    """Build the one-line status message for ``done`` of ``total`` images."""
	    images_per_second = done / elapsed if elapsed > 0 else 0
	    # Estimate the remaining time from the average time per finished image.
	    remaining_time = (elapsed / done) * (total - done) if done else 0
	    return (
	        f"Processing: {done}/{total} images | "
	        f"Speed: {images_per_second:.2f} img/s | "
	        f"Remaining: {remaining_time/60:.1f} min"
	    )


	def process_batch(files, description_type, booru_tags="", progress=gr.Progress(track_tqdm=True)):
	    """Caption a list of uploaded images, yielding an update after each one.

	    Args:
	        files: Uploaded files. Each item may be an object exposing the path as
	            ``.name`` or a plain path string. ``None`` or an empty list yields
	            a single "nothing to do" update.
	        description_type: Forwarded unchanged to ``generate_caption``.
	        booru_tags: Forwarded unchanged to ``generate_caption``; the same tags
	            are used for every image in the batch.
	        progress: Gradio progress tracker. It is not used directly in the body;
	            presumably declared so Gradio tracks tqdm progress -- confirm.

	    Yields:
	        tuple: ``(results, status, captions_text)`` where ``results`` is the
	        list of ``(filename, caption)`` pairs for images captioned so far,
	        ``status`` is a progress message, and ``captions_text`` holds every
	        caption and ``[ERROR]`` entry so far, separated by blank lines.
	    """
	    results = []
	    caption_blocks = []

	    if not files:
	        yield results, "No images to process.", ""
	        return

	    total_files = len(files)
	    start_time = time.monotonic()

	    for idx, file in enumerate(files, 1):
	        # Accept both file wrappers (path in ``.name``) and plain path strings.
	        path = getattr(file, "name", file)
	        filename = Path(path).name

	        try:
	            image = load_image(path)
	            caption = generate_caption(image, description_type, booru_tags)
	        except Exception as e:
	            # Best effort: record the failure and carry on with the next file.
	            error_msg = f"Error processing {filename}: {str(e)}"
	            print(error_msg)
	            caption_blocks.append(f"[ERROR] {error_msg}")
	        else:
	            caption_blocks.append(caption)
	            # Update the results list for the dataframe
	            results.append((filename, caption))

	        # Compute the statistics after the image is handled so the speed and
	        # ETA reflect finished work, and so a status message always exists
	        # even when the very first file fails.
	        progress_status = _format_batch_progress(
	            idx, total_files, time.monotonic() - start_time
	        )

	        # Yield progress status and captions separately
	        yield results, progress_status, "\n\n".join(caption_blocks)

	    # Final update
	    yield results, "✅ Processing complete!", "\n\n".join(caption_blocks)

	# Gradio Interface
	def _description_type_radio():
	    """Create the caption-style selector shown on both tabs."""
	    return gr.Radio(
	        label="Description Type",
	        value="JSON-like",
	        choices=["JSON-like", "Detailed", "Brief"],
	    )


	def _booru_tags_box():
	    """Create the optional booru-tags input shown on both tabs."""
	    return gr.Textbox(
	        label="Booru Tags (Optional)",
	        placeholder="Enter comma-separated booru tags...",
	        lines=3,
	    )


	with gr.Blocks(title="ToriiGate Image Captioner") as demo:
	    gr.Markdown("# ToriiGate Image Captioner")
	    gr.Markdown("Generate captions for anime images using ToriiGate-v0.3 model (4-bit quantized)")

	    # Tab 1: one image in, one caption out.
	    with gr.Tab("Single Image"):
	        with gr.Row():
	            # Left column: inputs and the trigger button.
	            with gr.Column():
	                input_image = gr.Image(label="Input Image", type="pil")
	                description_type = _description_type_radio()
	                booru_tags = _booru_tags_box()
	                submit_btn = gr.Button("Generate Caption")

	            # Right column: the generated caption.
	            with gr.Column():
	                output_text = gr.Textbox(lines=10, label="Generated Caption")

	        submit_btn.click(
	            fn=generate_caption,
	            inputs=[input_image, description_type, booru_tags],
	            outputs=output_text,
	        )

	    # Tab 2: many files in, captions streamed out as they are produced.
	    with gr.Tab("Batch Processing"):
	        with gr.Row():
	            with gr.Column():
	                input_files = gr.File(label="Input Images", file_count="multiple")
	                batch_description_type = _description_type_radio()
	                batch_booru_tags = _booru_tags_box()
	                batch_submit_btn = gr.Button("Process Batch")

	            with gr.Column():
	                progress_status = gr.Textbox(
	                    lines=2,
	                    label="Progress",
	                    show_copy_button=False,
	                )
	                output_text_batch = gr.Textbox(
	                    lines=25,
	                    label="Generated Captions",
	                    show_copy_button=True,
	                )
	                # The table still receives the results but is kept hidden.
	                output_gallery = gr.Dataframe(
	                    label="Generated Captions (Table View)",
	                    headers=["Filename", "Caption"],
	                    visible=False,
	                )

	        # Output order matches the tuples yielded by process_batch.
	        batch_submit_btn.click(
	            fn=process_batch,
	            inputs=[input_files, batch_description_type, batch_booru_tags],
	            outputs=[output_gallery, progress_status, output_text_batch],
	        )

	def main():
	    """Load the model up front, then serve the UI with a public share link."""
	    # Loading here means the first caption request does not pay the load cost.
	    load_model()
	    demo.launch(share=True)


	if __name__ == "__main__":
	    main()