Spaces:

st0bb3n
/

Cam2Speech

Runtime error

App Files Files Community

Cam2Speech / app.py

st0bb3n

Update app.py

147cf50 over 2 years ago

raw

history blame contribute delete

1.94 kB

	from transformers import ViTFeatureExtractor, ViTForImageClassification
	import gradio as gr
	from datasets import load_dataset
	import torch


	#dataset = load_dataset("cifar100")
	#image = dataset["train"]["fine_label"]
	#print("load and train dataset \n")

	# Both the preprocessing config and the classifier weights are loaded from
	# the same pretrained checkpoint, so the identifier is named once.
	_VIT_CHECKPOINT = 'google/vit-base-patch16-224'

	# Preprocessor used by classify() to turn an image into model inputs.
	feature_extractor = ViTFeatureExtractor.from_pretrained(_VIT_CHECKPOINT)
	print("feature extractor \n")
	# Image-classification model queried by classify().
	model = ViTForImageClassification.from_pretrained(_VIT_CHECKPOINT)
	print("load model \n")

	def classify(image):
		"""Return the label the pretrained classifier assigns to ``image``.

		The image is run through the module-level ``feature_extractor`` to get
		PyTorch tensors, passed through the module-level ``model`` without
		gradient tracking, and the index of the largest logit is looked up in
		``model.config.id2label``.
		"""
		encoded = feature_extractor(images=image, return_tensors="pt")
		print("define input \n")
		# Inference only, so gradients are not needed for the forward pass.
		with torch.no_grad():
			logits = model(**encoded).logits
		# model predicts one of the 1000 ImageNet classes
		print("prediction \n")
		best_idx = logits.argmax(-1).item()
		return model.config.id2label[best_idx]

	def image2speech(image):
		"""Classify ``image`` and synthesise speech for the resulting label.

		Returns a 2-tuple ``(speech, text)``: ``speech`` is whatever the
		module-level ``fastspeech`` callable returns for ``text``, and ``text``
		is the label from ``classify``. If classification raises an ordinary
		exception, ``text`` falls back to ``"No object detected"``.
		"""
		print("tts \n")
		try:
			txt = classify(image)
		except Exception as exc:
			# Catch Exception rather than using a bare ``except`` so that
			# KeyboardInterrupt / SystemExit still propagate, and report the
			# failure instead of swallowing it silently.
			print(f"classification failed: {exc!r}")
			txt = "No object detected"

		return fastspeech(txt), txt

	# --- Gradio wiring: load the TTS model, declare I/O components, build and
	# --- launch the interface. Progress is traced with print() at each step.

	print("load tts interface \n")
	# Loaded interface is later called like a function by image2speech().
	fastspeech = gr.Interface.load("huggingface/facebook/fastspeech2-en-ljspeech")

	print("sets input and outputs \n")
	# NOTE(review): gr.inputs / gr.outputs and type="auto" look like the older
	# Gradio component namespaces -- confirm they exist in the Gradio version
	# this Space is pinned to.
	camera = gr.inputs.Image(label="Image from your camera", source="webcam")
	read = gr.outputs.Textbox(type="auto", label="Text")
	speak = gr.outputs.Audio(type="auto", label="Speech")

	print("define interface \n")
	# The order of `outputs` must match image2speech's return value:
	# (speech, text) -> [speak, read].
	# NOTE(review): live=True presumably re-runs the function whenever the
	# input changes -- confirm against the Gradio docs.
	app = gr.Interface(fn=image2speech,
	inputs=camera,
	live=True,
	description="Takes a snapshot of an object, identifies it, and then tell you what it is. \n Intended use is to help the visually impaired. Models and dataset used is listed on the linked models and dataset",
	outputs=[speak, read],
	examples=["remotecontrol.jpg", "calculator.jpg", "cellphone.jpg"])

	print("launch interface \n")
	# NOTE(review): cache_examples is passed to launch() here; some Gradio
	# releases accept it on the Interface constructor instead -- verify for the
	# installed version. The example image files above must exist next to
	# this script for the examples to load.
	app.launch(cache_examples=True)