cutechicken committed on
Commit
5716c43
Β·
verified Β·
1 Parent(s): 5c6c33c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -311
app.py CHANGED
@@ -241,317 +241,6 @@ def read_uploaded_file(file):
241
  except Exception as e:
242
  return f"❌ 파일 읽기 였λ₯˜: {str(e)}", "error"
243
 
244
def chat(message, history, uploaded_file, system_message="", max_tokens=4000, temperature=0.7, top_p=0.9):
    """Gradio streaming chat callback.

    Streams the assistant reply by yielding ("", updated_history) pairs;
    the empty first element clears the input textbox on each update.

    Args:
        message: Latest user input.
        history: List of [user, assistant] pairs displayed so far.
        uploaded_file: Optional uploaded file whose content is appended to
            the system prompt so the model can reference it.
        system_message: Extra system-prompt text from the UI.
        max_tokens / temperature / top_p: Sampling parameters forwarded to
            the model manager.
    """
    if not message:
        # BUGFIX: this is a generator, so a bare `return "", history` would
        # discard the value and leave the UI without any update — yield it.
        yield "", history
        return

    system_prefix = """μ €λŠ” μ—¬λŸ¬λΆ„μ˜ μΉœκ·Όν•˜κ³  지적인 AI μ–΄μ‹œμŠ€ν„΄νŠΈ 'GiniGEN'μž…λ‹ˆλ‹€.. λ‹€μŒκ³Ό 같은 μ›μΉ™μœΌλ‘œ μ†Œν†΅ν•˜κ² μŠ΅λ‹ˆλ‹€:
1. 🀝 μΉœκ·Όν•˜κ³  곡감적인 νƒœλ„λ‘œ λŒ€ν™”
2. πŸ’‘ λͺ…ν™•ν•˜κ³  μ΄ν•΄ν•˜κΈ° μ‰¬μš΄ μ„€λͺ… 제곡
3. 🎯 질문의 μ˜λ„λ₯Ό μ •ν™•νžˆ νŒŒμ•…ν•˜μ—¬ λ§žμΆ€ν˜• λ‹΅λ³€
4. πŸ“š ν•„μš”ν•œ 경우 μ—…λ‘œλ“œλœ 파일 λ‚΄μš©μ„ μ°Έκ³ ν•˜μ—¬ ꡬ체적인 도움 제곡
5. ✨ 좔가적인 톡찰과 μ œμ•ˆμ„ ν†΅ν•œ κ°€μΉ˜ μžˆλŠ” λŒ€ν™”
항상 예의 λ°”λ₯΄κ³  μΉœμ ˆν•˜κ²Œ μ‘λ‹΅ν•˜λ©°, ν•„μš”ν•œ 경우 ꡬ체적인 μ˜ˆμ‹œλ‚˜ μ„€λͺ…을 μΆ”κ°€ν•˜μ—¬
이해λ₯Ό λ•κ² μŠ΅λ‹ˆλ‹€."""

    try:
        # Lazily load the model on the first message.
        model_manager.ensure_model_loaded()

        if uploaded_file:
            content, file_type = read_uploaded_file(uploaded_file)
            if file_type == "error":
                error_message = content
                chat_history.add_conversation(message, error_message)
                # BUGFIX: yield (not return) so the error actually reaches the UI.
                yield "", history + [[message, error_message]]
                return

            file_summary = analyze_file_content(content, file_type)

            # Tabular data renders better inside a markdown fence.
            if file_type in ['parquet', 'csv']:
                system_message += f"\n\n파일 λ‚΄μš©:\n```markdown\n{content}\n```"
            else:
                system_message += f"\n\n파일 λ‚΄μš©:\n```\n{content}\n```"

            if message == "파일 뢄석을 μ‹œμž‘ν•©λ‹ˆλ‹€...":
                message = f"""[파일 ꡬ쑰 뢄석] {file_summary}
λ‹€μŒ κ΄€μ μ—μ„œ 도움을 λ“œλ¦¬κ² μŠ΅λ‹ˆλ‹€:
1. πŸ“‹ μ „λ°˜μ μΈ λ‚΄μš© νŒŒμ•…
2. πŸ’‘ μ£Όμš” νŠΉμ§• μ„€λͺ…
3. 🎯 μ‹€μš©μ μΈ ν™œμš© λ°©μ•ˆ
4. ✨ κ°œμ„  μ œμ•ˆ
5. πŸ’¬ μΆ”κ°€ μ§ˆλ¬Έμ΄λ‚˜ ν•„μš”ν•œ μ„€λͺ…"""

        messages = [{"role": "system", "content": system_prefix + system_message}]

        # Replay prior turns so the model sees the whole conversation.
        if history:
            for user_msg, assistant_msg in history:
                messages.append({"role": "user", "content": user_msg})
                messages.append({"role": "assistant", "content": assistant_msg})

        messages.append({"role": "user", "content": message})

        partial_message = ""

        for response in model_manager.generate_response(
            messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p
        ):
            token = response.choices[0].delta.get('content', '')
            if token:
                partial_message += token
                yield "", history + [[message, partial_message]]

        # Persist the finished exchange.
        chat_history.add_conversation(message, partial_message)

    except Exception as e:
        error_msg = f"❌ 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"
        chat_history.add_conversation(message, error_msg)
        # BUGFIX: the original had `import os` fused onto this line (a file
        # concatenation artifact), which is a syntax error; the import now
        # lives at the top of the file.
        yield "", history + [[message, error_msg]]
313
import json
import os
from datetime import datetime
from threading import Thread

import gradio as gr
import pandas as pd
import spaces
import torch
from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer
323
# Environment configuration: the HF token authorises gated-model downloads,
# MODEL_ID is the Hugging Face repo id of the chat model to load.
HF_TOKEN = os.getenv("HF_TOKEN")
MODEL_ID = "CohereForAI/c4ai-command-r7b-12-2024"
327
class ModelManager:
    """Lazily loads the tokenizer/model and streams generated responses."""

    def __init__(self):
        # Loading is deferred to the first request so app start-up stays fast.
        self.tokenizer = None
        self.model = None

    def ensure_model_loaded(self):
        """Load the model and tokenizer on first use (idempotent)."""
        if self.model is None or self.tokenizer is None:
            self.setup_model()

    @spaces.GPU
    def setup_model(self):
        """Download and initialise the tokenizer and model weights.

        Raises:
            RuntimeError: if either the tokenizer or the model fails to load.
        """
        try:
            print("ν† ν¬λ‚˜μ΄μ € λ‘œλ”© μ‹œμž‘...")
            self.tokenizer = AutoTokenizer.from_pretrained(
                MODEL_ID,
                use_fast=True,
                token=HF_TOKEN,
                trust_remote_code=True
            )
            # Some checkpoints ship without a pad token; fall back to EOS.
            if not self.tokenizer.pad_token:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            print("ν† ν¬λ‚˜μ΄μ € λ‘œλ”© μ™„λ£Œ")

            print("λͺ¨λΈ λ‘œλ”© μ‹œμž‘...")
            self.model = AutoModelForCausalLM.from_pretrained(
                MODEL_ID,
                token=HF_TOKEN,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                trust_remote_code=True,
                low_cpu_mem_usage=True
            )
            self.model.eval()
            print("λͺ¨λΈ λ‘œλ”© μ™„λ£Œ")

        except Exception as e:
            print(f"λͺ¨λΈ λ‘œλ”© 쀑 였λ₯˜ λ°œμƒ: {e}")
            # BUGFIX: chain the original exception (`from e`) instead of a
            # bare `raise Exception(...)`, so the root cause stays visible.
            raise RuntimeError(f"λͺ¨λΈ λ‘œλ”© μ‹€νŒ¨: {e}") from e

    @spaces.GPU
    def generate_response(self, messages, max_tokens=4000, temperature=0.7, top_p=0.9):
        """Generate a reply and yield it word by word.

        Each yielded object mimics the OpenAI streaming shape
        (``response.choices[0].delta['content']``) so the UI code can stay
        provider-agnostic.

        Args:
            messages: list of {"role", "content"} dicts (system/user/assistant).
            max_tokens: maximum number of new tokens to sample.
            temperature / top_p: sampling parameters.

        Raises:
            RuntimeError: if generation fails for any reason.
        """
        try:
            self.ensure_model_loaded()

            # Flatten the chat into a plain-text prompt.
            prompt = ""
            for msg in messages:
                role = msg["role"]
                content = msg["content"]
                if role == "system":
                    prompt += f"System: {content}\n"
                elif role == "user":
                    prompt += f"Human: {content}\n"
                elif role == "assistant":
                    prompt += f"Assistant: {content}\n"
            prompt += "Assistant: "

            input_ids = self.tokenizer.encode(
                prompt,
                return_tensors="pt",
                add_special_tokens=True
            ).to(self.model.device)

            with torch.no_grad():
                output_ids = self.model.generate(
                    input_ids,
                    max_new_tokens=max_tokens,
                    do_sample=True,
                    temperature=temperature,
                    top_p=top_p,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    num_return_sequences=1
                )

            # Decode only the newly generated tail (skip the prompt tokens).
            generated_text = self.tokenizer.decode(
                output_ids[0][input_ids.shape[1]:],
                skip_special_tokens=True
            )

            # Stream word by word in an OpenAI-compatible envelope.
            words = generated_text.split()
            for word in words:
                yield type('Response', (), {
                    'choices': [type('Choice', (), {
                        'delta': {'content': word + " "}
                    })()]
                })()

        except Exception as e:
            print(f"응닡 생성 쀑 였λ₯˜ λ°œμƒ: {e}")
            raise RuntimeError(f"응닡 생성 μ‹€νŒ¨: {e}") from e
424
-
425
class ChatHistory:
    """Persists user/assistant exchanges to a JSON file under /tmp."""

    def __init__(self):
        self.history = []
        self.history_file = "/tmp/chat_history.json"
        self.load_history()

    def add_conversation(self, user_msg: str, assistant_msg: str):
        """Append one timestamped exchange and persist immediately."""
        self.history.append({
            "timestamp": datetime.now().isoformat(),
            "messages": [
                {"role": "user", "content": user_msg},
                {"role": "assistant", "content": assistant_msg},
            ],
        })
        self.save_history()

    def format_for_display(self):
        """Return the history as [user, assistant] pairs for the chatbot widget."""
        return [
            [conv["messages"][0]["content"], conv["messages"][1]["content"]]
            for conv in self.history
        ]

    def get_messages_for_api(self):
        """Flatten the stored exchanges into an OpenAI-style message list."""
        flat = []
        for conv in self.history:
            user_part, assistant_part = conv["messages"][0], conv["messages"][1]
            flat.append({"role": "user", "content": user_part["content"]})
            flat.append({"role": "assistant", "content": assistant_part["content"]})
        return flat

    def clear_history(self):
        """Drop all stored conversations and persist the empty list."""
        self.history = []
        self.save_history()

    def save_history(self):
        """Best-effort write to disk; failures are logged, not raised."""
        try:
            with open(self.history_file, 'w', encoding='utf-8') as f:
                json.dump(self.history, f, ensure_ascii=False, indent=2)
        except Exception as e:
            print(f"νžˆμŠ€ν† λ¦¬ μ €μž₯ μ‹€νŒ¨: {e}")

    def load_history(self):
        """Best-effort read from disk; on failure fall back to an empty history."""
        try:
            if os.path.exists(self.history_file):
                with open(self.history_file, 'r', encoding='utf-8') as f:
                    self.history = json.load(f)
        except Exception as e:
            print(f"νžˆμŠ€ν† λ¦¬ λ‘œλ“œ μ‹€νŒ¨: {e}")
            self.history = []
479
-
480
# Module-level singletons shared by the Gradio callbacks: one persistent
# conversation log and one lazily-initialised model wrapper.
chat_history = ChatHistory()
model_manager = ModelManager()
483
-
484
def analyze_file_content(content, file_type):
    """Analyze file content and return a one-line structural summary.

    Args:
        content: File text (for parquet/csv this is a markdown table preview).
        file_type: Type tag returned by ``read_uploaded_file``.

    Returns:
        A short, emoji-prefixed Korean summary string.
    """
    if file_type in ['parquet', 'csv']:
        try:
            lines = content.split('\n')
            header = lines[0]
            # Markdown tables use "|" separators: N+1 pipes -> N columns.
            columns = header.count('|') - 1
            # Discount the header row, the separator row and trailing lines.
            rows = len(lines) - 3
            return f"πŸ“Š 데이터셋 ꡬ쑰: {columns}개 컬럼, {rows}개 데이터"
        except Exception:
            # BUGFIX: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit.
            return "❌ 데이터셋 ꡬ쑰 뢄석 μ‹€νŒ¨"

    lines = content.split('\n')
    total_lines = len(lines)
    non_empty_lines = len([line for line in lines if line.strip()])

    # Heuristic: treat content with code keywords as source code.
    if any(keyword in content.lower() for keyword in ['def ', 'class ', 'import ', 'function']):
        functions = len([line for line in lines if 'def ' in line])
        classes = len([line for line in lines if 'class ' in line])
        imports = len([line for line in lines if 'import ' in line or 'from ' in line])
        return f"πŸ’» μ½”λ“œ ꡬ쑰: {total_lines}쀄 (ν•¨μˆ˜: {functions}, 클래슀: {classes}, μž„ν¬νŠΈ: {imports})"

    # Otherwise summarise as prose: blank-line-separated paragraphs and words.
    paragraphs = content.count('\n\n') + 1
    words = len(content.split())
    return f"πŸ“ λ¬Έμ„œ ꡬ쑰: {total_lines}쀄, {paragraphs}단락, μ•½ {words}단어"
509
-
510
def read_uploaded_file(file):
    """Read an uploaded file and return a ``(content, file_type)`` pair.

    Supported types: parquet ("parquet"), CSV ("csv"); anything else is read
    as text ("text"). On failure returns (error_message, "error") so callers
    can surface the message instead of crashing. ``file`` is expected to be a
    Gradio file object exposing a ``.name`` path.
    """
    if file is None:
        return "", ""
    try:
        file_ext = os.path.splitext(file.name)[1].lower()

        if file_ext == '.parquet':
            df = pd.read_parquet(file.name, engine='pyarrow')
            content = df.head(10).to_markdown(index=False)
            return content, "parquet"
        elif file_ext == '.csv':
            # Korean files frequently use cp949/euc-kr; try each in turn.
            encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
            for encoding in encodings:
                try:
                    df = pd.read_csv(file.name, encoding=encoding)
                    content = f"πŸ“Š 데이터 미리보기:\n{df.head(10).to_markdown(index=False)}\n\n"
                    content += f"\nπŸ“ˆ 데이터 정보:\n"
                    content += f"- 전체 ν–‰ 수: {len(df)}\n"
                    content += f"- 전체 μ—΄ 수: {len(df.columns)}\n"
                    content += f"- 컬럼 λͺ©λ‘: {', '.join(df.columns)}\n"
                    content += f"\nπŸ“‹ 컬럼 데이터 νƒ€μž…:\n"
                    for col, dtype in df.dtypes.items():
                        content += f"- {col}: {dtype}\n"
                    null_counts = df.isnull().sum()
                    if null_counts.any():
                        content += f"\n⚠️ 결츑치:\n"
                        for col, null_count in null_counts[null_counts > 0].items():
                            content += f"- {col}: {null_count}개 λˆ„λ½\n"
                    return content, "csv"
                except UnicodeDecodeError:
                    continue
            # BUGFIX: UnicodeDecodeError's constructor requires 5 arguments;
            # raising it with a single message actually raised TypeError and
            # hid the intended text. ValueError carries the message and is
            # still caught by the handler below.
            raise ValueError(f"❌ μ§€μ›λ˜λŠ” μΈμ½”λ”©μœΌλ‘œ νŒŒμΌμ„ 읽을 수 μ—†μŠ΅λ‹ˆλ‹€ ({', '.join(encodings)})")
        else:
            encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
            for encoding in encodings:
                try:
                    with open(file.name, 'r', encoding=encoding) as f:
                        content = f.read()
                        return content, "text"
                except UnicodeDecodeError:
                    continue
            raise ValueError(f"❌ μ§€μ›λ˜λŠ” μΈμ½”λ”©μœΌλ‘œ νŒŒμΌμ„ 읽을 수 μ—†μŠ΅λ‹ˆλ‹€ ({', '.join(encodings)})")
    except Exception as e:
        return f"❌ 파일 읽기 였λ₯˜: {str(e)}", "error"
554
-
555
  def chat(message, history, uploaded_file, system_message="", max_tokens=4000, temperature=0.7, top_p=0.9):
556
  if not message:
557
  return "", history
 
241
  except Exception as e:
242
  return f"❌ 파일 읽기 였λ₯˜: {str(e)}", "error"
243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  def chat(message, history, uploaded_file, system_message="", max_tokens=4000, temperature=0.7, top_p=0.9):
245
  if not message:
246
  return "", history