OssamaLafhel committed
Commit 35d9624
1 Parent(s): fd28244

Update handler.py

Files changed (1):
  1. handler.py +52 -14
handler.py CHANGED
@@ -1,3 +1,8 @@
+import time
+import json
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
 import transformers
 from transformers import pipeline
 import torch
@@ -5,6 +10,7 @@ from torch import nn
 import torch.nn.functional as F
 from torch.cuda.amp import custom_fwd, custom_bwd
 from bitsandbytes.functional import quantize_blockwise, dequantize_blockwise
+from loguru import logger
 from typing import Dict, List, Any


@@ -153,30 +159,62 @@ class GPTJForCausalLM(transformers.models.gptj.modeling_gptj.GPTJForCausalLM):
 transformers.models.gptj.modeling_gptj.GPTJBlock = GPTJBlock  # monkey-patch GPT-J


+class Message(BaseModel):
+    input: str = None
+    output: dict = None
+    length: str = None
+    temperature: str = None
+
+
+app = FastAPI()
+
+origins = [
+    "http://localhost:8000",
+    "http://localhost",
+    "http://localhost:3000",
+    "http://127.0.0.1:3000"
+]
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+    allow_credentials=True,
+    allow_methods=["POST"],
+    allow_headers=["*"],
+)
+
 # -----------------------------------------> API <---------------------------------------
+tokenizer = transformers.AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
+model = GPTJForCausalLM.from_pretrained("Kanpredict/gptj-6b-8bits", low_cpu_mem_usage=True)
+device = 'cuda' if torch.cuda.is_available() else 'cpu'


 class EndpointHandler:
     def __init__(self, path=""):
         # load the model
-        tokenizer = transformers.AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
-        model = GPTJForCausalLM.from_pretrained(path, low_cpu_mem_usage=True)
-
-        # check for GPU
-        device = 0 if torch.cuda.is_available() else -1
         model.to(device)
-
         # create inference pipeline
-        self.pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)
+        self.pipeline = pipeline(model=model, tokenizer=tokenizer, device=device)

     def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", None)

-        # pass inputs with all kwargs in data
-        if parameters is not None:
-            prediction = self.pipeline(inputs, **parameters)
-        else:
-            prediction = self.pipeline(inputs)
-        # postprocess the prediction
-        return prediction
+        # run the model and get the output(generated text)
+        prompt = inputs
+        temperature = float(parameters.temperature)
+        length = int(parameters.length)
+        logger.info("message input: %s", prompt)
+        logger.info("tempereture: %s", parameters.temperature)
+        logger.info("length: %s", parameters.length)
+        start = time.time()
+        prompt = tokenizer(prompt, return_tensors='pt')
+        prompt = {key: value.to(device) for key, value in prompt.items()}
+        out = model.generate(**prompt, min_length=length, max_length=length, temperature=temperature, do_sample=True)
+        generated_text = tokenizer.decode(out[0])
+        logger.info("generated text: ", generated_text)
+        logger.info("time taken: %s", time.time() - start)
+        result = {"output": generated_text}
+        result = json.dumps(result)
+        return result
+
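
For reference, a minimal local smoke test of the new __call__ path could look like the sketch below. It is not part of the commit: the handler import path and the SimpleNamespace wrapper are assumptions, chosen because __call__ reads parameters.temperature and parameters.length as attributes (a plain dict would raise AttributeError), and note that the method returns a JSON string rather than the List[List[Dict[str, float]]] its annotation suggests.

# Hypothetical smoke test, not part of commit 35d9624.
from types import SimpleNamespace

from handler import EndpointHandler  # assumed import path

endpoint = EndpointHandler()
payload = {
    "inputs": "Once upon a time",
    # attribute-style access is what __call__ expects from "parameters"
    "parameters": SimpleNamespace(temperature="0.8", length="64"),
}
result = endpoint(payload)  # JSON string, e.g. '{"output": "Once upon a time ..."}'
print(result)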
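
The commit also adds a FastAPI app, CORS middleware, and a Message model, but the diff defines no route, so the app is not yet wired to the handler. One possible wiring is sketched below; the /generate path, the endpoint instance, and the import path are assumptions, not part of the commit.

# Hypothetical route, not part of commit 35d9624.
import json

from handler import EndpointHandler, Message, app  # assumed import path

endpoint = EndpointHandler()


@app.post("/generate")
def generate(message: Message) -> dict:
    # A Message instance exposes .temperature and .length, matching what
    # EndpointHandler.__call__ expects from the "parameters" object.
    result = endpoint({"inputs": message.input, "parameters": message})
    # __call__ returns a JSON string, so decode it before FastAPI re-serializes it.
    return json.loads(result)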