Spaces:

BoltzmannEntropy
/

QuantumLLMInstruct

Running

QuantumLLMInstruct / Q_app_llm_pairs_gen.py

BoltzmannEntropy

5ebbc77 11 days ago

7.45 kB

	"""
	Quantum Physics Problem Generator
	Shlomo Kashani

	Description:
	------------
	This module is part of the QuantumLLMInstruct system, designed to generate and solve quantum physics problems
	using advanced Large Language Models (LLMs). It utilizes a multi-stage pipeline for problem generation,
	solution generation, and database management.

	Core Functionalities:
	---------------------
	1. Problem Generation:
	- Generates quantum physics problems in LaTeX format using LLMs.
	- Supports domain-specific problem generation across multiple quantum fields.

	2. Solution Generation:
	- Provides step-by-step LaTeX solutions for the generated problems using a second LLM.

	3. Data Management:
	- Stores generated problems and solutions in DuckDB and Parquet files.
	- Enables exporting data in Parquet format for scalability and compatibility.

	4. Gradio Interface:
	- A user-friendly interface to interact with the system, including problem generation,
	solution generation, and database exploration.

	5. Hugging Face Integration:
	- Supports visualization and interaction with the dataset on the Hugging Face platform.

	Main Components:
	----------------
	- initialize_duckdb() / initialize_parquet(): Initializes the database schema.
	- generate_multiple_problems(): Generates multiple problems for the selected quantum domains.
	- generate_solutions(): Solves unsolved problems in the database.
	- export_parquet(): Exports the database to a Parquet file for external use.

	Dependencies:
	-------------
	- Python 3.7+
	- Transformers: `transformers`
	- DuckDB: `duckdb`
	- Gradio: `gradio`
	- Pandas: `pandas`
	"""
	import gradio as gr

	from Q_llm_prompts import *
	from Q_quantum_utils import *

	# Set up the DuckDB storage before any UI is built. The module docstring
	# describes this helper as initializing the database schema; it comes from
	# one of the star imports above.
	initialize_duckdb()

	# Markdown/HTML blurb (dataset link, pipeline flowchart image, two-step
	# pipeline summary) that is passed to gr.Markdown(description) in the app.
	description = """
	This demo showcases [QuantumLLMInstruct](https://huggingface.co/datasets/BoltzmannEntropy/QuantumLLMInstruct/)
	<img src="https://huggingface.co/datasets/BoltzmannEntropy/QuantumLLMInstruct/resolve/main/qlmmi-detailed-flowchart.jpg" alt="The Pipeline" width="70%" align="center" />
	## 🚀 Pipeline:
	1. 📝 Problem Generation: The Qwen model generates a user instruction.
	2. 💬 Solution Generation: The Qwen model generates a response to this instruction.
	"""

	# Gradio app
	with gr.Blocks() as app:
	    # Main Gradio application block defining the QuantumLLMInstruct pipeline.
	    # It provides tabs for viewing datasets, generating problems, generating
	    # solutions, and exporting the database.
	    #
	    # These notes are comments on purpose: a string literal placed inside a
	    # `with` body is not a docstring, only a discarded expression statement.
	    gr.Markdown("# QuantumLLMInstruct: A 500k LLM Instruction-Tuning Dataset with Problem-Solution Pairs for Quantum Computing.")

	    with gr.Tab("View HF DB"):
	        # Embeds the Hugging Face dataset viewer for QuantumLLMInstruct.
	        gr.Markdown("### Generated Dataset")
	        gr.HTML("""<iframe
	src="https://huggingface.co/datasets/BoltzmannEntropy/QuantumLLMInstruct/embed/viewer"
	frameborder="0"
	width="100%"
	height="560px"
	></iframe>""")

	    with gr.Tab("LLM stage 1 model: Generate Problems"):
	        # Stage 1: pick a model, pick the domains, and choose how many
	        # problems to generate.
	        model_selector = gr.Dropdown(
	            choices=model_options,
	            value=model_options[0],
	            label="Select Qwen Model",
	        )
	        reload_button = gr.Button("Reload Model")
	        reload_status = gr.Textbox(label="Model Status", interactive=False)

	        generate_button = gr.Button("🚀 Generate Instructions For the Pair")
	        result_output = gr.Textbox(label="Generated Problems", interactive=False)
	        num_pairs = gr.Radio(choices=[1, 5, 50, 200, 2000, 20000, 200000], value=5, label="Number of Problems")

	        # Every domain is offered and pre-selected by default.
	        domain_selector = gr.CheckboxGroup(
	            choices=list(quantum_problem_domains.keys()),
	            value=list(quantum_problem_domains.keys()),
	            label="Select Domain Types",
	        )

	        # Reload the model chosen in the dropdown and show the returned status.
	        reload_button.click(
	            reload_model,
	            inputs=[model_selector],
	            outputs=[reload_status],
	        )

	        def generate_and_display(num_pairs, selected_domains):
	            """
	            Generate quantum problems for the chosen domains and report completion.

	            Args:
	                num_pairs (int): Number of problems to generate.
	                selected_domains (list): Quantum domains selected in the UI.

	            Returns:
	                str: Fixed status message shown once generation has returned.
	            """
	            # The helper's return value is not used here; only the fixed
	            # status text is sent to the output textbox.
	            generate_multiple_problems(num_pairs, selected_domains)
	            return "Problems generated successfully."

	        generate_button.click(
	            generate_and_display,
	            inputs=[num_pairs, domain_selector],
	            outputs=[result_output],
	        )

	    with gr.Tab("LLM Stage 2: Generate Solutions"):
	        # Stage 2: generate solutions for the stored problems with the
	        # selected solution model.
	        generate_solutions_button = gr.Button("🚀 Generate Responses for the Pair")
	        solutions_status = gr.Textbox(label="Solution Generation Status", interactive=False)

	        # NOTE(review): the default is the fifth entry, so this assumes
	        # solutions_model_options has at least five items - confirm.
	        solutions_model_selector = gr.Dropdown(
	            choices=solutions_model_options,
	            value=solutions_model_options[4],
	            label="Select Solution Model",
	        )

	        generate_solutions_button.click(
	            generate_solutions,
	            inputs=[solutions_model_selector],
	            outputs=[solutions_status],
	        )

	    with gr.Tab("View instruction-pairs DB"):
	        # Shows a summary and the stored rows from the DuckDB database as HTML.
	        summary_output = gr.HTML()
	        view_button = gr.Button("View Data")
	        db_output_display = gr.HTML()

	        # First of two handlers on the same button: fills the summary panel.
	        view_button.click(load_summary_from_duckdb, inputs=None, outputs=summary_output)

	        def view_db_data():
	            """
	            Load the problems stored in DuckDB and render them as an HTML table.

	            Returns:
	                str: HTML table of the stored rows, or a short notice when the
	                loaded DataFrame is empty.
	            """
	            df = load_problems_from_duckdb()
	            if df.empty:
	                return "<p>No data found in the DuckDB database.</p>"
	            # NOTE(review): escape=False writes cell text into the page as raw
	            # HTML; confirm the stored text is meant to carry markup.
	            return df.to_html(index=False, escape=False)

	        # Second handler on the same button: fills the detailed table.
	        view_button.click(
	            view_db_data,
	            inputs=None,
	            outputs=[db_output_display],
	        )

	    with gr.Tab("Export Parquet"):
	        # Exports the DuckDB database to a Parquet file.
	        gr.Markdown("### Export DuckDB Data to Parquet Format")
	        db_file_input = gr.Textbox(label="Database File Path", value="quantum_problems.duckdb")
	        export_button = gr.Button("Export Data")
	        export_status = gr.Textbox(label="Export Status", interactive=False)

	        export_button.click(
	            export_parquet,
	            inputs=[db_file_input],
	            outputs=[export_status],
	        )

	    # Project description rendered after the tabs.
	    gr.Markdown(description)

	# Start serving the Gradio interface bound to `app` by the Blocks context.
	app.launch()