"""Gradio app that generates ready-to-run unstructured-ingest pipeline code.

The user picks a source connector, a destination connector, an optional
chunking strategy, and an optional embedding provider; the app renders a
complete Python script for the unstructured-ingest v2 Pipeline API plus
links to the relevant connector documentation.
"""

import json
import os  # NOTE(review): only referenced inside generated-code templates; kept for safety

import gradio as gr

# Connector metadata (import lines, config snippets, doc URLs) is data-driven:
# it lives in JSON files next to this script. Fail fast at startup if missing.
with open('source_connectors.json', 'r') as f:
    source_connectors = json.load(f)
with open('destination_connectors.json', 'r') as f:
    destination_connectors = json.load(f)


def generate_documentation_link(source, destination):
    """Return a single markdown line linking to both connectors' docs pages.

    Args:
        source: source-connector metadata dict with 'source_connector' and 'docs' keys.
        destination: destination-connector metadata dict with 'destination_connector' and 'docs' keys.
    """
    return f"[{source['source_connector']} source connector documentation]({source['docs']}) | [{destination['destination_connector']} destination connector documentation]({destination['docs']})"


def _chunking_snippet(chunking_strategy, chunk_size, chunk_overlap):
    """Render the ChunkerConfig code fragment, or a skip-comment when disabled."""
    # Guard against both the "None" choice and a missing (Python None) value.
    if chunking_strategy in (None, "None"):
        return '\n        # Chunking step skipped\n'
    # gr.Number hands back floats; cast so the generated code reads 1000, not 1000.0.
    max_chars = int(chunk_size) if chunk_size is not None else 1000
    overlap = int(chunk_overlap) if chunk_overlap is not None else 20
    return f'''
        chunker_config=ChunkerConfig(
            chunking_strategy="{chunking_strategy}",
            chunk_max_characters={max_chars},
            chunk_overlap={overlap}
        ),'''


def _embedding_snippet(embedding):
    """Render the EmbedderConfig code fragment for the chosen provider.

    Providers differ only in which credentials the generated code reads from
    the environment: HuggingFace needs none, AWS Bedrock needs the AWS key
    pair, and every other provider uses a generic API-key variable.
    """
    # A dropdown with no selection yields Python None — treat it as "skip".
    if embedding in (None, "None"):
        return '        # Embedding step is skipped'
    if embedding == "langchain-huggingface":
        return f'''
        embedder_config=EmbedderConfig(
            embedding_provider="{embedding}",
            embedding_model_name=os.getenv("EMBEDDING_MODEL_NAME"),
        ),'''
    if embedding == "langchain-aws-bedrock":
        return f'''
        embedder_config=EmbedderConfig(
            embedding_provider="{embedding}",
            embedding_model_name=os.getenv("EMBEDDING_MODEL_NAME"),
            embedding_aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
            embedding_aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
        ),'''
    return f'''
        embedder_config=EmbedderConfig(
            embedding_provider="{embedding}",
            embedding_model_name=os.getenv("EMBEDDING_MODEL_NAME"),
            embedding_api_key=os.getenv("EMBEDDING_PROVIDER_API_KEY"),
        ),'''


def generate_code(source, destination, chunking_strategy, chunk_size, chunk_overlap, embedding):
    """Build the unstructured-ingest script text and a documentation-links line.

    Args:
        source: key into source_connectors (dropdown label).
        destination: key into destination_connectors (dropdown label).
        chunking_strategy: "None" or one of the supported strategies.
        chunk_size: max characters per chunk (may be None; defaults to 1000).
        chunk_overlap: overlap in characters (may be None; defaults to 20).
        embedding: "None"/None or an embedding-provider identifier.

    Returns:
        (code, doc_link): the generated Python source and a markdown docs line.
    """
    source_connector = source_connectors[source]
    destination_connector = destination_connectors[destination]

    # Re-indent the connector config snippets so they sit at the argument
    # level inside the generated Pipeline.from_configs(...) call.
    indented_source_configs = '\n'.join(
        '        ' + line for line in source_connector['configs'].strip().split('\n'))
    indented_destination_configs = '\n'.join(
        '        ' + line for line in destination_connector['configs'].strip().split('\n'))

    chunking_config = _chunking_snippet(chunking_strategy, chunk_size, chunk_overlap)
    embedding_config = _embedding_snippet(embedding)

    code = f'''
import os

from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.interfaces import ProcessorConfig
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
{source_connector['imports']}
{destination_connector['imports']}
from unstructured_ingest.v2.processes.chunker import ChunkerConfig
from unstructured_ingest.v2.processes.embedder import EmbedderConfig

if __name__ == "__main__":
    Pipeline.from_configs(
        context=ProcessorConfig(),
{indented_source_configs}
        partitioner_config=PartitionerConfig(
            partition_by_api=True,
            api_key=os.getenv("UNSTRUCTURED_API_KEY"),
            partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
            strategy="hi_res",
        ),{chunking_config}
{embedding_config}
{indented_destination_configs}
    ).run()
'''

    doc_link = generate_documentation_link(source_connector, destination_connector)
    return code, doc_link


with gr.Blocks() as demo:
    gr.Markdown("# Unstructured-Ingest Code Generator")
    gr.Markdown("Generate code for the unstructured-ingest library based on your inputs. Learn more about using Unstructured Serverless API in the [documentation](https://docs.unstructured.io/api-reference/ingest/overview).")
    with gr.Row():
        with gr.Column(scale=1):
            source = gr.Dropdown(list(source_connectors.keys()), label="Get unstructured documents from:", value="S3")
            destination = gr.Dropdown(list(destination_connectors.keys()), label="Upload RAG-ready documents to:", value="Local directory")
            chunking_strategy = gr.Dropdown(["None", "by_title", "basic", "by_page", "by_similarity"], label="Chunking strategy:", value="None")
            chunk_size = gr.Number(value=1000, label="Chunk size (characters):", step=1)
            chunk_overlap = gr.Number(value=20, label="Chunk overlap (characters):", step=1)
            # value="None" so an untouched dropdown doesn't arrive as Python None
            # and accidentally trigger embedding-config generation.
            embedding = gr.Dropdown(["None", "langchain-openai", "langchain-huggingface", "langchain-aws-bedrock", "langchain-vertexai", "langchain-voyageai", "octoai"], label="Embedding provider:", value="None")
            submit_button = gr.Button("Generate Code")
        with gr.Column(scale=2):
            output_code = gr.Code(language="python", label="Generated Code")
            output_docs = gr.Markdown(label="Documentation Links")

    submit_button.click(
        fn=generate_code,
        inputs=[source, destination, chunking_strategy, chunk_size, chunk_overlap, embedding],
        outputs=[output_code, output_docs]
    )

demo.launch()