Spaces:
Running
on
Zero
Running
on
Zero
cutechicken
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -36,16 +36,15 @@ class ModelManager:
|
|
36 |
print("토크나이저 로딩 완료")
|
37 |
|
38 |
print("모델 로딩 시작...")
|
|
|
39 |
self.model = AutoModelForCausalLM.from_pretrained(
|
40 |
MODEL_ID,
|
41 |
token=HF_TOKEN,
|
42 |
torch_dtype=torch.float16,
|
43 |
-
device_map=
|
44 |
trust_remote_code=True,
|
45 |
-
low_cpu_mem_usage=True
|
46 |
-
max_memory={0: "13GB"} # GPU 메모리 제한
|
47 |
)
|
48 |
-
self.model.eval()
|
49 |
print("모델 로딩 완료")
|
50 |
|
51 |
except Exception as e:
|
@@ -55,11 +54,13 @@ class ModelManager:
|
|
55 |
@spaces.GPU
|
56 |
def generate_text(self, prompt, max_tokens, temperature, top_p):
|
57 |
try:
|
|
|
|
|
58 |
input_ids = self.tokenizer.encode(
|
59 |
prompt,
|
60 |
return_tensors="pt",
|
61 |
add_special_tokens=True
|
62 |
-
).to(
|
63 |
|
64 |
with torch.no_grad():
|
65 |
output_ids = self.model.generate(
|
@@ -73,11 +74,15 @@ class ModelManager:
|
|
73 |
num_return_sequences=1
|
74 |
)
|
75 |
|
|
|
|
|
76 |
return self.tokenizer.decode(
|
77 |
output_ids[0][input_ids.shape[1]:],
|
78 |
skip_special_tokens=True
|
79 |
)
|
80 |
except Exception as e:
|
|
|
|
|
81 |
raise Exception(f"텍스트 생성 실패: {e}")
|
82 |
|
83 |
def generate_response(self, messages, max_tokens=4000, temperature=0.7, top_p=0.9):
|
|
|
36 |
print("토크나이저 로딩 완료")
|
37 |
|
38 |
print("모델 로딩 시작...")
|
39 |
+
# CUDA 초기화 방지를 위한 설정
|
40 |
self.model = AutoModelForCausalLM.from_pretrained(
|
41 |
MODEL_ID,
|
42 |
token=HF_TOKEN,
|
43 |
torch_dtype=torch.float16,
|
44 |
+
device_map=None, # ์ด๊ธฐ์๋ device_map์ ์ค์ ํ์ง ์์
|
45 |
trust_remote_code=True,
|
46 |
+
low_cpu_mem_usage=True
|
|
|
47 |
)
|
|
|
48 |
print("모델 로딩 완료")
|
49 |
|
50 |
except Exception as e:
|
|
|
54 |
@spaces.GPU
|
55 |
def generate_text(self, prompt, max_tokens, temperature, top_p):
|
56 |
try:
|
57 |
+
# GPU 컨텍스트 내에서 device 설정
|
58 |
+
self.model = self.model.to("cuda")
|
59 |
input_ids = self.tokenizer.encode(
|
60 |
prompt,
|
61 |
return_tensors="pt",
|
62 |
add_special_tokens=True
|
63 |
+
).to("cuda")
|
64 |
|
65 |
with torch.no_grad():
|
66 |
output_ids = self.model.generate(
|
|
|
74 |
num_return_sequences=1
|
75 |
)
|
76 |
|
77 |
+
# CPU๋ก ๋ค์ ์ด๋
|
78 |
+
self.model = self.model.to("cpu")
|
79 |
return self.tokenizer.decode(
|
80 |
output_ids[0][input_ids.shape[1]:],
|
81 |
skip_special_tokens=True
|
82 |
)
|
83 |
except Exception as e:
|
84 |
+
if self.model.device.type == "cuda":
|
85 |
+
self.model = self.model.to("cpu")
|
86 |
raise Exception(f"텍스트 생성 실패: {e}")
|
87 |
|
88 |
def generate_response(self, messages, max_tokens=4000, temperature=0.7, top_p=0.9):
|