File size: 7,090 Bytes
f9b9d56
83ee74c
574f73e
705c5b5
83ee74c
f9b9d56
ad9db85
 
 
7ddfb78
 
 
ad9db85
 
 
 
f9b9d56
2af89cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
705c5b5
0997082
2af89cf
705c5b5
 
 
2af89cf
 
705c5b5
 
 
2af89cf
 
705c5b5
2af89cf
0997082
d57197f
 
 
 
 
 
 
 
ad9db85
2af89cf
705c5b5
7ffca43
2af89cf
 
 
 
 
 
7ffca43
2af89cf
7ffca43
 
 
 
 
2af89cf
 
 
7ffca43
 
 
 
 
705c5b5
 
d57197f
0997082
705c5b5
83ee74c
0997082
ad9db85
 
f2c0975
 
83ee74c
0997082
 
83ee74c
7ffca43
705c5b5
 
d57197f
 
 
0997082
7ffca43
d57197f
 
705c5b5
7b3fa19
7ffca43
7b3fa19
 
63c5e29
 
 
ad9db85
7ffca43
 
 
 
 
 
63c5e29
 
 
2af89cf
ad9db85
 
 
 
 
 
7ffca43
f86099a
2af89cf
d57197f
7ffca43
63c5e29
 
 
 
 
 
 
 
7ffca43
 
ad9db85
7ffca43
 
 
 
 
 
ad9db85
7ffca43
 
63c5e29
 
 
 
 
 
 
 
 
 
 
 
 
f9b9d56
 
63c5e29
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import gradio as gr
from huggingface_hub import InferenceClient
import os
import pandas as pd
from typing import List, Tuple

# LLM model definitions: label offered in the model selector -> model id
# handed to InferenceClient.
LLM_MODELS = {
    "Default": "CohereForAI/c4ai-command-r-plus-08-2024",  # default model
    "Meta": "meta-llama/Llama-3.3-70B-Instruct",    
    "Mistral": "mistralai/Mistral-Nemo-Instruct-2407",
    "Alibaba": "Qwen/QwQ-32B-Preview"
}

def get_client(model_name):
    """Build an InferenceClient for the model registered under ``model_name``.

    ``model_name`` must be a key of ``LLM_MODELS``; an unknown key raises
    ``KeyError``. The access token is taken from the ``HF_TOKEN`` environment
    variable and is ``None`` when that variable is unset.
    """
    model_id = LLM_MODELS[model_name]
    hf_token = os.getenv("HF_TOKEN")
    return InferenceClient(model_id, token=hf_token)

def analyze_file_content(content, file_type):
    """Return a one-line summary of an uploaded file's content.

    Args:
        content: The file text. For ``file_type == 'parquet'`` this is the
            Markdown table rendered from the data.
        file_type: ``'parquet'`` selects the table summary; any other value
            is summarised as source code or plain text.

    Returns:
        A single-line, user-facing summary string.
    """
    if file_type == 'parquet':
        # A Markdown row with N cells contains N + 1 pipes, so only the
        # header row may be counted. Counting pipes over the whole table
        # would scale the "column count" with the number of rows.
        header_row = content.split('\n', 1)[0]
        column_count = max(header_row.count('|') - 1, 0)
        return f"데이터셋 뢄석: {column_count}개 컬럼의 데이터 ν…Œμ΄λΈ”"

    # Text files (source code or plain text)
    lines = content.split('\n')
    total_lines = len(lines)

    if 'def ' in content or 'class ' in content:
        # Substring heuristic: any line mentioning "def " / "class " counts.
        functions = sum(1 for line in lines if 'def ' in line)
        classes = sum(1 for line in lines if 'class ' in line)
        return f"μ½”λ“œ 뢄석: {total_lines}μ€„μ˜ Python μ½”λ“œ ({functions}개 ν•¨μˆ˜, {classes}개 클래슀 포함)"

    non_empty_lines = sum(1 for line in lines if line.strip())
    return f"ν…μŠ€νŠΈ 뢄석: {total_lines}μ€„μ˜ ν…μŠ€νŠΈ λ¬Έμ„œ (유효 λ‚΄μš© {non_empty_lines}쀄)"

def read_uploaded_file(file):
    """Read an uploaded file and return ``(content, file_type)``.

    Args:
        file: ``None``, a filesystem path (``str`` or ``os.PathLike``), or an
            object that exposes its path as ``.name`` (and optionally a
            ``.read()`` method).

    Returns:
        ``("", "")`` when ``file`` is ``None``;
        ``(markdown_table, "parquet")`` for ``.parquet`` files, where the
        table holds the first 10 rows;
        ``(text, "text")`` for any other file, decoded as UTF-8;
        ``(error_message, "error")`` if reading fails for any reason.
    """
    if file is None:
        return "", ""
    try:
        # The upload widget in this file is created with type="filepath", so
        # the value arrives as a plain path string. Accessing `.name` on a
        # str raises AttributeError, so paths are handled explicitly and
        # file-like objects remain supported.
        if isinstance(file, (str, os.PathLike)):
            path = os.fspath(file)
            reader = None
        else:
            path = file.name
            reader = getattr(file, "read", None)

        if path.endswith('.parquet'):
            df = pd.read_parquet(path, engine='pyarrow')
            content = df.head(10).to_markdown(index=False)
            return content, "parquet"

        if callable(reader):
            content = reader()
        else:
            with open(path, 'rb') as fh:
                content = fh.read()
        if isinstance(content, bytes):
            content = content.decode('utf-8')
        return content, "text"
    except Exception as e:
        # Deliberate catch-all: any failure is reported back to the caller as
        # an "error" result instead of raising.
        return f"νŒŒμΌμ„ μ½λŠ” 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}", "error"

def format_history(history):
    """Flatten ``(user, assistant)`` chat pairs into role/content dicts.

    Each pair yields a ``{"role": "user", ...}`` entry, followed by a
    ``{"role": "assistant", ...}`` entry only when the assistant text is
    truthy. Order of the pairs is preserved.
    """
    messages = []
    for user_text, reply_text in history:
        turn = [{"role": "user", "content": user_text}]
        if reply_text:
            turn.append({"role": "assistant", "content": reply_text})
        messages.extend(turn)
    return messages

def chat(message, history, uploaded_file, model_name, system_message="", max_tokens=4000, temperature=0.7, top_p=0.9):
    """Run one chat turn against the selected model and return the new chat state.

    Args:
        message: The user's message. The auto-analysis sentinel (see
            ``auto_analysis_trigger`` below) is replaced by a file-analysis
            prompt when a file is attached.
        history: Previous ``[user, assistant]`` pairs; ``None`` is treated as
            an empty history.
        uploaded_file: Uploaded file (or falsy for none); its content is
            appended to the system message.
        model_name: Key into ``LLM_MODELS`` selecting the model.
        system_message: Extra system instructions appended after the built-in
            prefix.
        max_tokens: Forwarded to ``chat_completion``.
        temperature: Forwarded to ``chat_completion``.
        top_p: Forwarded to ``chat_completion``.

    Returns:
        ``("", new_history)``: an empty string plus the history extended with
        ``[message, reply]``. File-read and inference failures are reported
        as the reply text instead of raising.
    """
    system_prefix = """λ°˜λ“œμ‹œ ν•œκΈ€λ‘œ 닡변할것. λ„ˆλŠ” 주어진 μ†ŒμŠ€μ½”λ“œλ‚˜ 데이터λ₯Ό 기반으둜 "μ„œλΉ„μŠ€ μ‚¬μš© μ„€λͺ… 및 μ•ˆλ‚΄, Q&Aλ₯Ό ν•˜λŠ” 역할이닀". μ•„μ£Ό μΉœμ ˆν•˜κ³  μžμ„Έν•˜κ²Œ 4000토큰 이상 Markdown ν˜•μ‹μœΌλ‘œ μž‘μ„±ν•˜λΌ. λ„ˆλŠ” μž…λ ₯된 λ‚΄μš©μ„ 기반으둜 μ‚¬μš© μ„€λͺ… 및 질의 응닡을 μ§„ν–‰ν•˜λ©°, μ΄μš©μžμ—κ²Œ 도움을 μ£Όμ–΄μ•Ό ν•œλ‹€."""

    # Sentinel message that requests an automatic analysis of the uploaded file.
    auto_analysis_trigger = "파일 뢄석을 μ‹œμž‘ν•©λ‹ˆλ‹€."

    # A missing history would break both list concatenation and iteration.
    history = history or []

    if uploaded_file:
        content, file_type = read_uploaded_file(uploaded_file)
        if file_type == "error":
            # Surface the read error as the assistant reply.
            return "", history + [[message, content]]

        if file_type == 'parquet':
            system_message += f"\n\n파일 λ‚΄μš©:\n```markdown\n{content}\n```"
        else:
            system_message += f"\n\n파일 λ‚΄μš©:\n```python\n{content}\n```"

        if message == auto_analysis_trigger:
            # Analyse and summarise the file content for the analysis prompt.
            file_summary = analyze_file_content(content, file_type)
            message = f"""[파일 μš”μ•½] {file_summary}

λ‹€μŒ λ‚΄μš©μ„ ν¬ν•¨ν•˜μ—¬ μƒμ„Ένžˆ μ„€λͺ…ν•˜λΌ:
1. 파일의 μ£Όμš” λͺ©μ κ³Ό κΈ°λŠ₯
2. μ£Όμš” νŠΉμ§•κ³Ό κ΅¬μ„±μš”μ†Œ
3. ν™œμš© 방법 및 μ‚¬μš© μ‹œλ‚˜λ¦¬μ˜€
4. μ£Όμ˜μ‚¬ν•­ 및 μ œν•œμ‚¬ν•­
5. κΈ°λŒ€νš¨κ³Ό 및 μž₯점"""
    elif message == auto_analysis_trigger:
        # The sentinel arrived without a file (presumably the upload was
        # cleared - confirm against the event wiring). There is nothing to
        # analyse, so skip the model call and leave the chat unchanged.
        return "", history

    messages = [{"role": "system", "content": f"{system_prefix} {system_message}"}]
    messages.extend(format_history(history))
    messages.append({"role": "user", "content": message})

    try:
        client = get_client(model_name)
        pieces = []
        for chunk in client.chat_completion(
            messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            token = chunk.choices[0].delta.get('content', None)
            if token:
                pieces.append(token)
        response = "".join(pieces)
    except Exception as e:
        # Top-level boundary for the UI: report the failure in the chat
        # window; any partially streamed text is discarded.
        response = f"μΆ”λ‘  쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"

    return "", history + [[message, response]]

# Custom stylesheet handed to gr.Blocks(css=...): hides <footer> elements.
css = """
footer {visibility: hidden}
"""

# UI layout: chat area on the left, model / file / advanced settings on the right.
with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css) as demo:
    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(height=600)
            msg = gr.Textbox(
                label="λ©”μ‹œμ§€λ₯Ό μž…λ ₯ν•˜μ„Έμš”",
                show_label=False,
                placeholder="λ©”μ‹œμ§€λ₯Ό μž…λ ₯ν•˜μ„Έμš”...",
                container=False
            )
            clear = gr.ClearButton([msg, chatbot])
        
        with gr.Column(scale=1):
            model_name = gr.Radio(
                choices=list(LLM_MODELS.keys()),
                value="Default",
                label="LLM λͺ¨λΈ 선택",
                info="μ‚¬μš©ν•  LLM λͺ¨λΈμ„ μ„ νƒν•˜μ„Έμš”"
            )
            
            file_upload = gr.File(
                label="파일 μ—…λ‘œλ“œ",
                file_types=["text", ".parquet"],
                type="filepath"
            )
            
            with gr.Accordion("κ³ κΈ‰ μ„€μ •", open=False):
                system_message = gr.Textbox(label="System Message", value="")
                max_tokens = gr.Slider(minimum=1, maximum=8000, value=4000, label="Max Tokens")
                temperature = gr.Slider(minimum=0, maximum=1, value=0.7, label="Temperature")
                top_p = gr.Slider(minimum=0, maximum=1, value=0.9, label="Top P")

    # Event binding: submitting the textbox runs one chat turn.
    msg.submit(
        chat,
        inputs=[msg, chatbot, file_upload, model_name, system_message, max_tokens, temperature, top_p],
        outputs=[msg, chatbot]
    )

    # Automatic analysis when the uploaded file changes. The sentinel message
    # is supplied through a hidden textbox: a component created here is part
    # of the Blocks layout, so without visible=False it would show up on the
    # page as a stray, editable input.
    file_upload.change(
        chat,
        inputs=[gr.Textbox(value="파일 뢄석을 μ‹œμž‘ν•©λ‹ˆλ‹€.", visible=False), chatbot, file_upload, model_name, system_message, max_tokens, temperature, top_p],
        outputs=[msg, chatbot]
    )

    # Example prompts that fill the message textbox.
    gr.Examples(
        examples=[
            ["μƒμ„Έν•œ μ‚¬μš© 방법을 마치 화면을 λ³΄λ©΄μ„œ μ„€λͺ…ν•˜λ“―이 4000 토큰 이상 μžμ„Ένžˆ μ„€λͺ…ν•˜λΌ"],
            ["FAQ 20건을 μƒμ„Έν•˜κ²Œ μž‘μ„±ν•˜λΌ. 4000토큰 이상 μ‚¬μš©ν•˜λΌ."],
            ["μ‚¬μš© 방법과 차별점, νŠΉμ§•, 강점을 μ€‘μ‹¬μœΌλ‘œ 4000 토큰 이상 유튜브 μ˜μƒ 슀크립트 ν˜•νƒœλ‘œ μž‘μ„±ν•˜λΌ"],
            ["λ³Έ μ„œλΉ„μŠ€λ₯Ό SEO μ΅œμ ν™”ν•˜μ—¬ λΈ”λ‘œκ·Έ 포슀트둜 4000 토큰 이상 μž‘μ„±ν•˜λΌ"],
            ["νŠΉν—ˆ μΆœμ›μ— ν™œμš©ν•  기술 및 λΉ„μ¦ˆλ‹ˆμŠ€λͺ¨λΈ 츑면을 ν¬ν•¨ν•˜μ—¬ νŠΉν—ˆ μΆœμ›μ„œ ꡬ성에 맞게 μž‘μ„±ν•˜λΌ"],
            ["계속 μ΄μ–΄μ„œ λ‹΅λ³€ν•˜λΌ"],
        ],
        inputs=msg,
    )

# Start the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()