metadata
datasets:
- nuprl/EditPackFT-Multi
tags:
- code
What is this
This is a DeepSeek Coder 7B model trained to predict a commit message from a diff.
Languages trained on:
LANGS = [  # programming languages covered by the training data
    "Python", "Rust", "JavaScript", "Java", "Go", "C++",
    "C#", "Ruby", "PHP", "TypeScript", "C", "Scala",
    "Swift", "Kotlin", "Objective-C", "Perl", "Haskell", "Bash",
    "Sh", "Lua", "R", "Julia",
]
How to prompt:
import difflib
class NDiff:
    """Line-level diff between two strings, computed with ``difflib.ndiff``.

    Both inputs are split on ``"\\n"`` and compared line by line. The diff
    lines are materialized once in ``__init__`` so that every method can
    iterate over them any number of times.
    """

    def __init__(self, s1, s2):
        """Compute and store the diff from ``s1`` (old text) to ``s2`` (new text)."""
        self.s1 = s1
        self.s2 = s2
        # difflib.ndiff returns a one-shot generator; keep a list so that
        # __str__, str_colored, num_removed and num_added can each be called
        # repeatedly and in any order without seeing an exhausted iterator.
        self.diff = list(difflib.ndiff(s1.split("\n"), s2.split("\n")))

    def __str__(self):
        """Return the diff as text, omitting ndiff's ``?`` hint lines."""
        return "\n".join(line for line in self.diff if not line.startswith("?"))

    def str_colored(self):
        """Return the diff with removed lines in red and added lines in green.

        ``?`` hint lines are skipped. Every emitted line is followed by a
        newline. Requires the third-party ``colored`` package, imported
        lazily so the rest of the class works without it.
        """
        import colored

        pieces = []
        for line in self.diff:
            if line.startswith("?"):
                continue
            if line.startswith("-"):
                pieces.append(colored.stylize(line, colored.fg("red")))
            elif line.startswith("+"):
                pieces.append(colored.stylize(line, colored.fg("green")))
            else:
                pieces.append(line)
            pieces.append("\n")
        return "".join(pieces)

    def num_removed(self):
        """Return the number of diff lines marked as removed (``-``)."""
        return sum(1 for line in self.diff if line.startswith("-"))

    def num_added(self):
        """Return the number of diff lines marked as added (``+``)."""
        return sum(1 for line in self.diff if line.startswith("+"))

    def __repr__(self):
        """Return the same text as ``__str__``."""
        return self.__str__()
def format_prompt(old, new):
    """Build the model prompt for a code change.

    The prompt is a ``<diff>`` header line, the textual ``NDiff`` of ``old``
    versus ``new``, and a ``<commit_message>`` header line, with a trailing
    newline after the last header.
    """
    rendered_diff = str(NDiff(old, new))
    # The empty final element produces the trailing newline.
    return "\n".join(("<diff>", rendered_diff, "<commit_message>", ""))
def gen(old, new, max_new_tokens=200, temperature=0.45, top_p=0.90):
    """Sample commit-message completions for the change from ``old`` to ``new``.

    Relies on ``tokenizer`` and ``model`` objects already being available in
    the enclosing scope. The prompt built by ``format_prompt`` is encoded,
    moved to ``model.device``, and passed to ``model.generate`` with sampling
    enabled.

    Returns a list with one decoded string per generated sequence; the prompt
    tokens are sliced off and special tokens are skipped when decoding.
    """
    input_ids = tokenizer.encode(
        format_prompt(old, new), return_tensors="pt"
    ).to(model.device)
    sequences = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )
    # Each output sequence starts with the prompt; keep only what follows it.
    prompt_len = len(input_ids[0])
    completions = []
    for seq in sequences:
        completions.append(
            tokenizer.decode(seq[prompt_len:], skip_special_tokens=True)
        )
    return completions
Use the `gen` function with the old and new code (it expects `model` and `tokenizer` to be loaded already).
Example:
- import datasets
- from pathlib import Path
from code_editing.models import CodeLlamaEditModel, LlamaChatModel, EditModel, EditCommand, ChatAdaptorEditModel, OctoCoderChatModel, codellama_edit_prompt_diff, apply_rel_diff_trim, OpenAIChatModel, StarCoderCommitEditModel
from code_editing.humanevalpack import batch_prompts_from_example
from code_editing.utils import gunzip_json_write
from typing import List, Callable
from tqdm import tqdm
# NOTE: this is the factory for each model type. to add a new model type, add a new case here
# and implement it in models.py. Also, add a new case in the argument parser below.
- def model_factory(model_type: str, quantize=False, num_gpus=1) -> Callable[[str], EditModel]:
+ def model_factory(
+ model_type: str,
+ quantize=False,
+ num_gpus=1,
+ system_supported=True,
+ ) -> Callable[[str], EditModel]:
if model_type == "codellama" or model_type == "deepseek":
return CodeLlamaEditModel
elif model_type == "starcoder":
return StarCoderCommitEditModel
elif model_type == "codellama-diff":
return (lambda path: CodeLlamaEditModel(path, prompt_format=codellama_edit_prompt_diff, post_process=apply_rel_diff_trim))
elif model_type == "openai":
return (lambda path: ChatAdaptorEditModel(OpenAIChatModel(path)))
elif model_type == "codellama-chat":
- return (lambda path: ChatAdaptorEditModel(LlamaChatModel(path, quantization=quantize, num_gpus=num_gpus)))
+ return (lambda path: ChatAdaptorEditModel(LlamaChatModel(path, quantization=quantize, num_gpus=num_gpus, system_supported=system_supported)))
elif model_type == "octocoder":
return (lambda path: ChatAdaptorEditModel(OctoCoderChatModel(path, quantization=quantize, num_gpus=num_gpus)))
else:
raise ValueError(f"Unknown model type: {model_type}")
def complete_problem(example: EditCommand, model: EditModel, batch_size: int, completion_limit: int, **kwargs) -> List[str]:
batches = batch_prompts_from_example(example, batch_size, completion_limit)
completions = []
for batch in batches:
resps = model.generate(batch, **kwargs)
for resp in resps:
completions.append(resp["content"])
return completions
Produced:
Add system_supported argument to model_factory