---
datasets:
  - nuprl/EditPackFT-Multi
tags:
  - code
---

What is this

This is a DeepSeek Coder 7B model fine-tuned to predict commit messages from diffs.

Languages trained on:

LANGS = [
    "Python",
    "Rust",
    "JavaScript",
    "Java",
    "Go",
    "C++",
    "C#",
    "Ruby",
    "PHP",
    "TypeScript",
    "C",
    "Scala",
    "Swift",
    "Kotlin",
    "Objective-C",
    "Perl",
    "Haskell",
    "Bash",
    "Sh",
    "Lua",
    "R",
    "Julia",
]

How to prompt:

import difflib
class NDiff:
    """Line-level ndiff between two versions of a file."""

    def __init__(self, s1, s2):
        self.s1 = s1
        self.s2 = s2
        # materialize the diff so it can be iterated more than once
        self.diff = list(difflib.ndiff(s1.split("\n"), s2.split("\n")))

    def __str__(self):
        # skip the "?" hint lines that ndiff emits
        return "\n".join([l for l in self.diff if l[0] != "?"])

    def str_colored(self):
        # colorize removed lines in red and added lines in green
        import colored

        buf = ""
        for l in self.diff:
            if l[0] == "?":
                continue
            if l[0] == "-":
                buf += colored.stylize(l, colored.fg("red"))
            elif l[0] == "+":
                buf += colored.stylize(l, colored.fg("green"))
            else:
                buf += l
            buf += "\n"
        return buf

    def num_removed(self):
        return len([l for l in self.diff if l[0] == "-"])

    def num_added(self):
        return len([l for l in self.diff if l[0] == "+"])

    def __repr__(self):
        return self.__str__()

def format_prompt(old, new):
    # prompt format: <diff>\n{ndiff of old vs new}\n<commit_message>\n
    diff_header = "<diff>"
    instr_header = "<commit_message>"
    diff = str(NDiff(old, new))
    return f"{diff_header}\n{diff}\n{instr_header}\n"

def gen(old, new, max_new_tokens=200, temperature=0.45, top_p=0.90):
    # assumes `model` and `tokenizer` are already loaded (see the loading sketch below)
    prompt = format_prompt(old, new)
    toks = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    outs = model.generate(toks, max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature, top_p=top_p)
    return [tokenizer.decode(out[len(toks[0]):], skip_special_tokens=True) for out in outs]
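
The gen function assumes that model and tokenizer are already in scope. Below is a minimal loading sketch using the standard transformers Auto classes; the repo id is a placeholder rather than something stated on this card, so substitute this model's actual Hub id.

from transformers import AutoModelForCausalLM, AutoTokenizer

# placeholder repo id -- replace with this model's actual Hugging Face Hub id
MODEL_ID = "<this-model-repo-id>"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# device_map="auto" requires the accelerate package
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto", device_map="auto")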

use the "gen" function with the old and new code

Example:

- import datasets
- from pathlib import Path
  from code_editing.models import CodeLlamaEditModel, LlamaChatModel, EditModel, EditCommand, ChatAdaptorEditModel, OctoCoderChatModel, codellama_edit_prompt_diff, apply_rel_diff_trim, OpenAIChatModel, StarCoderCommitEditModel
  from code_editing.humanevalpack import batch_prompts_from_example
  from code_editing.utils import gunzip_json_write
  from typing import List, Callable
  from tqdm import tqdm
  
  
  # NOTE: this is the factory for each model type. to add a new model type, add a new case here
  # and implement it in models.py. Also, add a new case in the argument parser below.
- def model_factory(model_type: str, quantize=False, num_gpus=1) -> Callable[[str], EditModel]:
+ def model_factory(
+         model_type: str,
+         quantize=False,
+         num_gpus=1,
+         system_supported=True,
+ ) -> Callable[[str], EditModel]:
      if model_type == "codellama" or model_type == "deepseek":
          return CodeLlamaEditModel
      elif model_type == "starcoder":
          return StarCoderCommitEditModel
      elif model_type == "codellama-diff":
          return (lambda path: CodeLlamaEditModel(path, prompt_format=codellama_edit_prompt_diff, post_process=apply_rel_diff_trim))
      elif model_type == "openai":
          return (lambda path: ChatAdaptorEditModel(OpenAIChatModel(path)))
      elif model_type == "codellama-chat":
-         return (lambda path: ChatAdaptorEditModel(LlamaChatModel(path, quantization=quantize, num_gpus=num_gpus)))
+         return (lambda path: ChatAdaptorEditModel(LlamaChatModel(path, quantization=quantize, num_gpus=num_gpus, system_supported=system_supported)))
      elif model_type == "octocoder":
          return (lambda path: ChatAdaptorEditModel(OctoCoderChatModel(path, quantization=quantize, num_gpus=num_gpus)))
      else:
          raise ValueError(f"Unknown model type: {model_type}")
  
  def complete_problem(example: EditCommand, model: EditModel, batch_size: int, completion_limit: int, **kwargs) -> List[str]:
      batches = batch_prompts_from_example(example, batch_size, completion_limit)
  
      completions = []
      for batch in batches:
          resps = model.generate(batch, **kwargs)
          for resp in resps:
              completions.append(resp["content"])
  
      return completions

Produced:

Add system_supported argument to model_factory