Update app.py
app.py CHANGED
@@ -1,90 +1,158 @@
--- a/app.py
import os
-import gradio as gr
import torch
-from
-import
-
-
-
-
-
-        "max_seq_length": 1028,
-        "dtype": torch.float16,
-        "load_in_4bit": True
-    }
-}

-
hf_token = os.getenv("HF_TOKEN")

-#
-
-
-
-
-
-
-
-
-
-
-
-        config["model_name"],
-        use_auth_token=hf_token
-    )
-    loaded_models[selected_model] = (model, tokenizer)
-    return loaded_models[selected_model]
-
-alpaca_prompts = {
-    "information": "Give me information about the following topic: {}",
-    "vulnerable": """Identify the line of code that is vulnerable and describe the type of software vulnerability.
### Code Snippet:
{}
-### Vulnerability
-
}

-
-
-
-
-
-
-
-
-
-
-
-
-
-
)

-
-
-with gr.Blocks(theme=theme) as demo:
-    selected_model = gr.Dropdown(choices=list(model_configs.keys()), value="CyberSentinel", label="Model")
-    prompt = gr.Textbox(lines=5, placeholder="Enter your code snippet or topic here...", label="Prompt")
-    prompt_type = gr.Dropdown(choices=list(alpaca_prompts.keys()), value="Chat", label="Prompt Type")
-    max_length = gr.Slider(minimum=128, maximum=512, step=128, value=128, label="Max Length")
-    generated_text = gr.Textbox(label="Generated Text")
-
-    generate_button = gr.Button("Generate")
-
-    generate_button.click(predict, inputs=[selected_model, prompt, prompt_type, max_length], outputs=generated_text)
-
-    gr.Examples(
-        examples=[
-            ["CyberSentinel", "What is SQL injection?", "information", 128],
-            ["CyberSentinel", "$buff = 'A' x 10000;\nopen(myfile, '>>PASS.PK2');\nprint myfile $buff;\nclose(myfile);", "vulnerable", 128],
-            ["CyberSentinel", "Can you tell me a joke?", "Chat", 128]
-        ],
-        inputs=[selected_model, prompt, prompt_type, max_length]
-    )
-
-demo.queue(default_concurrency_limit=20).launch(
-    server_name="0.0.0.0",
-    allowed_paths=["/"],
-    share=True
-)
+++ b/app.py
import os
import torch
+from unsloth import FastLanguageModel, is_bfloat16_supported
+from trl import SFTTrainer
+from transformers import TrainingArguments
+from datasets import load_dataset
+import gradio as gr
+import json
+from huggingface_hub import HfApi
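+# Pipeline: load the 4-bit Gemma-2 27B base with Unsloth, attach LoRA adapters, fine-tune on the
+# dad1909/DCSV instruction dataset with TRL's SFTTrainer, then merge the adapters and push the result to the Hub.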

+max_seq_length = 4096
+dtype = None
+load_in_4bit = True
hf_token = os.getenv("HF_TOKEN")
+current_num = os.getenv("NUM")
+
+print(f"stage {current_num}")
+
+api = HfApi(token=hf_token)
+# models = f"dad1909/cybersentinal-2.0-{current_num}"
+model_base = "unsloth/gemma-2-27b-bnb-4bit"
+
+print("Starting model and tokenizer loading...")
+
+# Load the model and tokenizer
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name=model_base,
+    max_seq_length=max_seq_length,
+    dtype=dtype,
+    load_in_4bit=load_in_4bit,
+    token=hf_token
+)
+
+print("Model and tokenizer loaded successfully.")
+
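+# Attach LoRA adapters (r=16, alpha=16) to the attention and MLP projection layers; only these
+# adapter weights are trained while the 4-bit base weights stay frozen.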
+print("Configuring PEFT model...")
+model = FastLanguageModel.get_peft_model(
+    model,
+    r=16,
+    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+    lora_alpha=16,
+    lora_dropout=0,
+    bias="none",
+    use_gradient_checkpointing="unsloth",
+    random_state=3407,
+    use_rslora=False,
+    loftq_config=None,
+)
+print("PEFT model configured.")

+# Updated alpaca_prompt for different types
+alpaca_prompt = {
+    "learning_from": """Below is a CVE definition.
+### CVE definition:
+{}
+### detail CVE:
+{}""",
+    "definition": """Below is a definition about software vulnerability. Explain it.
+### Definition:
+{}
+### Explanation:
+{}""",
+    "code_vulnerability": """Below is a code snippet. Identify the line of code that is vulnerable and describe the type of software vulnerability.
### Code Snippet:
{}
+### Vulnerability solution:
+{}"""
}

+EOS_TOKEN = tokenizer.eos_token
+
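+# Pick a prompt template by how the instruction starts; anything unmatched falls back to
+# plain instruction/output concatenation in formatting_prompts_func below.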
+def detect_prompt_type(instruction):
+    if instruction.startswith("what is code vulnerable of this code:"):
+        return "code_vulnerability"
+    elif instruction.startswith("Learning from"):
+        return "learning_from"
+    elif instruction.startswith("what is"):
+        return "definition"
+    else:
+        return "unknown"
+
+def formatting_prompts_func(examples):
+    instructions = examples["instruction"]
+    outputs = examples["output"]
+    texts = []
+
+    for instruction, output in zip(instructions, outputs):
+        prompt_type = detect_prompt_type(instruction)
+        if prompt_type in alpaca_prompt:
+            prompt = alpaca_prompt[prompt_type].format(instruction, output)
+        else:
+            prompt = instruction + "\n\n" + output
+        text = prompt + EOS_TOKEN
+        texts.append(text)
+
+    return {"text": texts}
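+# The dataset is mapped through formatting_prompts_func so every example lands in the single
+# "text" column that SFTTrainer reads (dataset_text_field="text").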
+
+print("Loading dataset...")
+dataset = load_dataset("dad1909/DCSV", split="train")
+print("Dataset loaded successfully.")
+
+print("Applying formatting function to the dataset...")
+dataset = dataset.map(formatting_prompts_func, batched=True)
+print("Formatting function applied.")
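+# Short SFT run: 100 optimizer steps, per-device batch size 1, 8-bit AdamW, and fp16 or bf16
+# selected automatically from hardware support.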
+
+print("Initializing trainer...")
+trainer = SFTTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    train_dataset=dataset,
+    dataset_text_field="text",
+    max_seq_length=max_seq_length,
+    dataset_num_proc=2,
+    packing=False,
+    args=TrainingArguments(
+        per_device_train_batch_size=1,
+        gradient_accumulation_steps=1,
+        learning_rate=2e-4,
+        fp16=not is_bfloat16_supported(),
+        bf16=is_bfloat16_supported(),
+        warmup_steps=5,
+        logging_steps=10,
+        max_steps=100,
+        optim="adamw_8bit",
+        weight_decay=0.01,
+        lr_scheduler_type="linear",
+        seed=3407,
+        output_dir="outputs"
+    ),
+)
+print("Trainer initialized.")
+
+print("Starting training...")
+trainer_stats = trainer.train()
+print("Training completed.")
+
+num = int(current_num)
+num += 1
+
+uploads_models = f"cybersentinal-2.0-{str(num)}"
+
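+# Note: the merged model is pushed under `up`; `uploads_models` carries the incremented stage name
+# but is not used by the push below.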
+up = "sentinal-3.1-70B"
+
+print("Saving the trained model...")
+model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit")
+print("Model saved successfully.")
+
+print("Pushing the model to the hub...")
+model.push_to_hub_merged(
+    up,
+    tokenizer,
+    save_method="merged_16bit",
+    token=hf_token
)
+print("Model pushed to hub successfully.")
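+# Write the incremented stage counter back to the Space variable NUM so the next run picks it up.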
+api.delete_space_variable(repo_id="dad1909/CyberCode", key="NUM")
+api.add_space_variable(repo_id="dad1909/CyberCode", key="NUM", value=str(num))