openfree committed on
Commit
1fd4ab2
β€’
1 Parent(s): 521288b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -73
app.py CHANGED
@@ -4,9 +4,9 @@ import os
4
  import pandas as pd
5
  from typing import List, Tuple
6
 
7
- # LLM λͺ¨λΈ μ •μ˜
8
  LLM_MODELS = {
9
- "Cohere c4ai-crp-08-2024": "CohereForAI/c4ai-command-r-plus-08-2024", # κΈ°λ³Έ λͺ¨λΈ
10
  "Meta Llama3.3-70B": "meta-llama/Llama-3.3-70B-Instruct",
11
  "Mistral Nemo 2407": "mistralai/Mistral-Nemo-Instruct-2407",
12
  "Alibaba Qwen QwQ-32B": "Qwen/QwQ-32B-Preview"
@@ -16,19 +16,17 @@ def get_client(model_name):
16
  return InferenceClient(LLM_MODELS[model_name], token=os.getenv("HF_TOKEN"))
17
 
18
  def analyze_file_content(content, file_type):
19
- """파일 λ‚΄μš©μ„ λΆ„μ„ν•˜μ—¬ ꡬ쑰적 μš”μ•½μ„ λ°˜ν™˜"""
20
  if file_type in ['parquet', 'csv']:
21
  try:
22
- # 데이터셋 ꡬ쑰 뢄석
23
  lines = content.split('\n')
24
  header = lines[0]
25
  columns = header.count('|') - 1
26
- rows = len(lines) - 3 # 헀더와 ꡬ뢄선 μ œμ™Έ
27
- return f"데이터셋 ꡬ쑰: {columns}개 컬럼, {rows}개 데이터 μƒ˜ν”Œ"
28
  except:
29
- return "데이터셋 ꡬ쑰 뢄석 μ‹€νŒ¨"
30
 
31
- # ν…μŠ€νŠΈ/μ½”λ“œ 파일의 경우
32
  lines = content.split('\n')
33
  total_lines = len(lines)
34
  non_empty_lines = len([line for line in lines if line.strip()])
@@ -37,11 +35,11 @@ def analyze_file_content(content, file_type):
37
  functions = len([line for line in lines if 'def ' in line])
38
  classes = len([line for line in lines if 'class ' in line])
39
  imports = len([line for line in lines if 'import ' in line or 'from ' in line])
40
- return f"μ½”λ“œ ꡬ쑰 뢄석: 총 {total_lines}쀄 (ν•¨μˆ˜ {functions}개, 클래슀 {classes}개, μž„ν¬νŠΈ {imports}개)"
41
 
42
  paragraphs = content.count('\n\n') + 1
43
  words = len(content.split())
44
- return f"λ¬Έμ„œ ꡬ쑰 뢄석: 총 {total_lines}쀄, {paragraphs}개 문단, μ•½ {words}개 단어"
45
 
46
  def read_uploaded_file(file):
47
  if file is None:
@@ -54,32 +52,28 @@ def read_uploaded_file(file):
54
  content = df.head(10).to_markdown(index=False)
55
  return content, "parquet"
56
  elif file_ext == '.csv':
57
- # CSV 파일 읽기 μ‹œ λ‹€μ–‘ν•œ 인코딩 μ‹œλ„
58
  encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
59
  for encoding in encodings:
60
  try:
61
  df = pd.read_csv(file.name, encoding=encoding)
62
- content = f"데이터 미리보기:\n{df.head(10).to_markdown(index=False)}\n\n"
63
- content += f"\n데이터 정보:\n"
64
- content += f"- 총 ν–‰ 수: {len(df)}\n"
65
- content += f"- 총 μ—΄ 수: {len(df.columns)}\n"
66
- content += f"- 컬럼 λͺ©λ‘: {', '.join(df.columns)}\n"
67
- # 데이터 νƒ€μž… 정보 μΆ”κ°€
68
- content += f"\nμ»¬λŸΌλ³„ 데이터 νƒ€μž…:\n"
69
  for col, dtype in df.dtypes.items():
70
  content += f"- {col}: {dtype}\n"
71
- # 결츑치 정보 μΆ”κ°€
72
  null_counts = df.isnull().sum()
73
  if null_counts.any():
74
- content += f"\n결츑치 정보:\n"
75
  for col, null_count in null_counts[null_counts > 0].items():
76
- content += f"- {col}: {null_count}개\n"
77
  return content, "csv"
78
  except UnicodeDecodeError:
79
  continue
80
- raise UnicodeDecodeError(f"μ§€μ›λ˜λŠ” 인코딩({', '.join(encodings)})으둜 νŒŒμΌμ„ 읽을 수 μ—†μŠ΅λ‹ˆλ‹€.")
81
  else:
82
- # ν…μŠ€νŠΈ 파일 읽기 μ‹œλ„
83
  encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
84
  for encoding in encodings:
85
  try:
@@ -88,9 +82,9 @@ def read_uploaded_file(file):
88
  return content, "text"
89
  except UnicodeDecodeError:
90
  continue
91
- raise UnicodeDecodeError(f"μ§€μ›λ˜λŠ” 인코딩({', '.join(encodings)})으둜 νŒŒμΌμ„ 읽을 수 μ—†μŠ΅λ‹ˆλ‹€.")
92
  except Exception as e:
93
- return f"νŒŒμΌμ„ μ½λŠ” 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}", "error"
94
 
95
  def format_history(history):
96
  formatted_history = []
@@ -101,17 +95,16 @@ def format_history(history):
101
  return formatted_history
102
 
103
  def chat(message, history, uploaded_file, model_name, system_message="", max_tokens=4000, temperature=0.7, top_p=0.9):
104
- system_prefix = """λ„ˆλŠ” 파일 뢄석 μ „λ¬Έκ°€μž…λ‹ˆλ‹€. μ—…λ‘œλ“œλœ 파일의 λ‚΄μš©μ„ 깊이 있게 λΆ„μ„ν•˜μ—¬ λ‹€μŒκ³Ό 같은 κ΄€μ μ—μ„œ μ„€λͺ…ν•΄μ•Ό ν•©λ‹ˆλ‹€:
 
 
 
 
 
 
 
105
 
106
- 1. 파일의 μ „λ°˜μ μΈ ꡬ쑰와 ꡬ성
107
- 2. μ£Όμš” λ‚΄μš©κ³Ό νŒ¨ν„΄ 뢄석
108
- 3. λ°μ΄ν„°μ˜ νŠΉμ§•κ³Ό 의미
109
- - λ°μ΄ν„°μ…‹μ˜ 경우: 컬럼의 의미, 데이터 νƒ€μž…, κ°’μ˜ 뢄포
110
- - ν…μŠ€νŠΈ/μ½”λ“œμ˜ 경우: ꡬ쑰적 νŠΉμ§•, μ£Όμš” νŒ¨ν„΄
111
- 4. 잠재적 ν™œμš© λ°©μ•ˆ
112
- 5. 데이터 ν’ˆμ§ˆ 및 κ°œμ„  κ°€λŠ₯ν•œ λΆ€λΆ„
113
-
114
- 전문가적 κ΄€μ μ—μ„œ μƒμ„Έν•˜κ³  ꡬ쑰적인 뢄석을 μ œκ³΅ν•˜λ˜, μ΄ν•΄ν•˜κΈ° μ‰½κ²Œ μ„€λͺ…ν•˜μ„Έμš”. 뢄석 κ²°κ³ΌλŠ” Markdown ν˜•μ‹μœΌλ‘œ μž‘μ„±ν•˜κ³ , κ°€λŠ₯ν•œ ν•œ ꡬ체적인 μ˜ˆμ‹œλ₯Ό ν¬ν•¨ν•˜μ„Έμš”."""
115
 
116
  if uploaded_file:
117
  content, file_type = read_uploaded_file(uploaded_file)
@@ -119,24 +112,23 @@ def chat(message, history, uploaded_file, model_name, system_message="", max_tok
119
  yield "", history + [[message, content]]
120
  return
121
 
122
- # 파일 λ‚΄μš© 뢄석 및 ꡬ쑰적 μš”μ•½
123
  file_summary = analyze_file_content(content, file_type)
124
 
125
  if file_type in ['parquet', 'csv']:
126
- system_message += f"\n\n파일 λ‚΄μš©:\n```markdown\n{content}\n```"
127
  else:
128
- system_message += f"\n\n파일 λ‚΄μš©:\n```\n{content}\n```"
129
 
130
- if message == "파일 뢄석을 μ‹œμž‘ν•©λ‹ˆλ‹€.":
131
- message = f"""[ꡬ쑰 뢄석] {file_summary}
132
 
133
- λ‹€μŒ κ΄€μ μ—μ„œ 상세 뢄석을 μ œκ³΅ν•΄μ£Όμ„Έμš”:
134
- 1. 파일의 μ „λ°˜μ μΈ ꡬ쑰와 ν˜•μ‹
135
- 2. μ£Όμš” λ‚΄μš© 및 κ΅¬μ„±μš”μ†Œ 뢄석
136
- 3. 데이터/λ‚΄μš©μ˜ νŠΉμ§•κ³Ό νŒ¨ν„΄
137
- 4. ν’ˆμ§ˆ 및 완성도 평가
138
- 5. κ°œμ„  κ°€λŠ₯ν•œ λΆ€λΆ„ μ œμ•ˆ
139
- 6. μ‹€μ œ ν™œμš© λ°©μ•ˆ 및 μΆ”μ²œμ‚¬ν•­"""
140
 
141
  messages = [{"role": "system", "content": f"{system_prefix} {system_message}"}]
142
  messages.extend(format_history(history))
@@ -159,46 +151,56 @@ def chat(message, history, uploaded_file, model_name, system_message="", max_tok
159
  yield "", history + [[message, partial_message]]
160
 
161
  except Exception as e:
162
- error_msg = f"μΆ”λ‘  쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"
163
  yield "", history + [[message, error_msg]]
164
 
165
  css = """
166
  footer {visibility: hidden}
167
  """
168
 
169
- with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css) as demo:
 
 
 
 
 
 
 
 
 
170
  with gr.Row():
171
  with gr.Column(scale=2):
172
- chatbot = gr.Chatbot(height=600)
173
  msg = gr.Textbox(
174
- label="λ©”μ‹œμ§€λ₯Ό μž…λ ₯ν•˜μ„Έμš”",
175
  show_label=False,
176
- placeholder="λ©”μ‹œμ§€λ₯Ό μž…λ ₯ν•˜μ„Έμš”...",
177
  container=False
178
  )
179
- clear = gr.ClearButton([msg, chatbot])
180
 
181
  with gr.Column(scale=1):
182
  model_name = gr.Radio(
183
  choices=list(LLM_MODELS.keys()),
184
- value="Cohere c4ai-crp-08-2024", # 기본값을 Cohere λͺ¨λΈλ‘œ λͺ…μ‹œμ  지정
185
- label="μ΅œμ‹  LLM λͺ¨λΈ 선택",
186
- info="μ‚¬μš©ν•  LLM λͺ¨λΈμ„ μ„ νƒν•˜μ„Έμš”"
187
  )
188
 
189
  file_upload = gr.File(
190
- label="파일 μ—…λ‘œλ“œ (ν…μŠ€νŠΈ, μ½”λ“œ, CSV, Parquet 파일)",
 
191
  file_types=["text", ".csv", ".parquet"],
192
  type="filepath"
193
  )
194
 
195
- with gr.Accordion("κ³ κΈ‰ μ„€μ •", open=False):
196
- system_message = gr.Textbox(label="System Message", value="")
197
- max_tokens = gr.Slider(minimum=1, maximum=8000, value=4000, label="Max Tokens")
198
- temperature = gr.Slider(minimum=0, maximum=1, value=0.7, label="Temperature")
199
- top_p = gr.Slider(minimum=0, maximum=1, value=0.9, label="Top P")
200
 
201
- # 이벀트 바인딩
202
  msg.submit(
203
  chat,
204
  inputs=[msg, chatbot, file_upload, model_name, system_message, max_tokens, temperature, top_p],
@@ -210,26 +212,26 @@ with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css) as demo:
210
  [msg]
211
  )
212
 
213
- # 파일 μ—…λ‘œλ“œ μ‹œ μžλ™ 뢄석
214
  file_upload.change(
215
  chat,
216
- inputs=[gr.Textbox(value="파일 뢄석을 μ‹œμž‘ν•©λ‹ˆλ‹€."), chatbot, file_upload, model_name, system_message, max_tokens, temperature, top_p],
217
  outputs=[msg, chatbot],
218
  queue=True
219
  )
220
 
221
- # 예제 μΆ”κ°€
222
  gr.Examples(
223
  examples=[
224
- ["파일의 μ „λ°˜μ μΈ ꡬ쑰와 νŠΉμ§•μ„ μžμ„Ένžˆ μ„€λͺ…ν•΄μ£Όμ„Έμš”."],
225
- ["이 파일의 μ£Όμš” νŒ¨ν„΄κ³Ό νŠΉμ§•μ„ λΆ„μ„ν•΄μ£Όμ„Έμš”."],
226
- ["파일의 ν’ˆμ§ˆκ³Ό κ°œμ„  κ°€λŠ₯ν•œ 뢀뢄을 ν‰κ°€ν•΄μ£Όμ„Έμš”."],
227
- ["이 νŒŒμΌμ„ μ‹€μ œλ‘œ μ–΄λ–»κ²Œ ν™œμš©ν•  수 μžˆμ„κΉŒμš”?"],
228
- ["파일의 μ£Όμš” λ‚΄μš©μ„ μš”μ•½ν•˜κ³  핡심 μΈμ‚¬μ΄νŠΈλ₯Ό λ„μΆœν•΄μ£Όμ„Έμš”."],
229
- ["이전 뢄석을 μ΄μ–΄μ„œ 더 μžμ„Ένžˆ μ„€λͺ…ν•΄μ£Όμ„Έμš”."],
230
  ],
231
  inputs=msg,
232
  )
233
 
234
  if __name__ == "__main__":
235
- demo.launch()
 
4
  import pandas as pd
5
  from typing import List, Tuple
6
 
7
+ # LLM Models Definition
8
  LLM_MODELS = {
9
+ "Cohere c4ai-crp-08-2024": "CohereForAI/c4ai-command-r-plus-08-2024", # Default
10
  "Meta Llama3.3-70B": "meta-llama/Llama-3.3-70B-Instruct",
11
  "Mistral Nemo 2407": "mistralai/Mistral-Nemo-Instruct-2407",
12
  "Alibaba Qwen QwQ-32B": "Qwen/QwQ-32B-Preview"
 
16
  return InferenceClient(LLM_MODELS[model_name], token=os.getenv("HF_TOKEN"))
17
 
18
  def analyze_file_content(content, file_type):
19
+ """Analyze file content and return structural summary"""
20
  if file_type in ['parquet', 'csv']:
21
  try:
 
22
  lines = content.split('\n')
23
  header = lines[0]
24
  columns = header.count('|') - 1
25
+ rows = len(lines) - 3
26
+ return f"πŸ“Š Dataset Structure: {columns} columns, {rows} data samples"
27
  except:
28
+ return "❌ Dataset structure analysis failed"
29
 
 
30
  lines = content.split('\n')
31
  total_lines = len(lines)
32
  non_empty_lines = len([line for line in lines if line.strip()])
 
35
  functions = len([line for line in lines if 'def ' in line])
36
  classes = len([line for line in lines if 'class ' in line])
37
  imports = len([line for line in lines if 'import ' in line or 'from ' in line])
38
+ return f"πŸ’» Code Structure: {total_lines} lines (Functions: {functions}, Classes: {classes}, Imports: {imports})"
39
 
40
  paragraphs = content.count('\n\n') + 1
41
  words = len(content.split())
42
+ return f"πŸ“ Document Structure: {total_lines} lines, {paragraphs} paragraphs, ~{words} words"
43
 
44
  def read_uploaded_file(file):
45
  if file is None:
 
52
  content = df.head(10).to_markdown(index=False)
53
  return content, "parquet"
54
  elif file_ext == '.csv':
 
55
  encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
56
  for encoding in encodings:
57
  try:
58
  df = pd.read_csv(file.name, encoding=encoding)
59
+ content = f"πŸ“Š Data Preview:\n{df.head(10).to_markdown(index=False)}\n\n"
60
+ content += f"\nπŸ“ˆ Data Information:\n"
61
+ content += f"- Total Rows: {len(df)}\n"
62
+ content += f"- Total Columns: {len(df.columns)}\n"
63
+ content += f"- Column List: {', '.join(df.columns)}\n"
64
+ content += f"\nπŸ“‹ Column Data Types:\n"
 
65
  for col, dtype in df.dtypes.items():
66
  content += f"- {col}: {dtype}\n"
 
67
  null_counts = df.isnull().sum()
68
  if null_counts.any():
69
+ content += f"\n⚠️ Missing Values:\n"
70
  for col, null_count in null_counts[null_counts > 0].items():
71
+ content += f"- {col}: {null_count} missing\n"
72
  return content, "csv"
73
  except UnicodeDecodeError:
74
  continue
75
+ raise UnicodeDecodeError(f"❌ Unable to read file with supported encodings ({', '.join(encodings)})")
76
  else:
 
77
  encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
78
  for encoding in encodings:
79
  try:
 
82
  return content, "text"
83
  except UnicodeDecodeError:
84
  continue
85
+ raise UnicodeDecodeError(f"❌ Unable to read file with supported encodings ({', '.join(encodings)})")
86
  except Exception as e:
87
+ return f"❌ Error reading file: {str(e)}", "error"
88
 
89
  def format_history(history):
90
  formatted_history = []
 
95
  return formatted_history
96
 
97
  def chat(message, history, uploaded_file, model_name, system_message="", max_tokens=4000, temperature=0.7, top_p=0.9):
98
+ system_prefix = """You are a file analysis expert. Analyze the uploaded file in depth from the following perspectives:
99
+ 1. πŸ“‹ Overall structure and composition
100
+ 2. πŸ“Š Key content and pattern analysis
101
+ 3. πŸ“ˆ Data characteristics and meaning
102
+ - For datasets: Column meanings, data types, value distributions
103
+ - For text/code: Structural features, main patterns
104
+ 4. πŸ’‘ Potential applications
105
+ 5. ✨ Data quality and areas for improvement
106
 
107
+ Provide detailed and structured analysis from an expert perspective, but explain in an easy-to-understand way. Format the analysis results in Markdown and include specific examples where possible."""
 
 
 
 
 
 
 
 
108
 
109
  if uploaded_file:
110
  content, file_type = read_uploaded_file(uploaded_file)
 
112
  yield "", history + [[message, content]]
113
  return
114
 
 
115
  file_summary = analyze_file_content(content, file_type)
116
 
117
  if file_type in ['parquet', 'csv']:
118
+ system_message += f"\n\nFile Content:\n```markdown\n{content}\n```"
119
  else:
120
+ system_message += f"\n\nFile Content:\n```\n{content}\n```"
121
 
122
+ if message == "Starting file analysis...":
123
+ message = f"""[Structure Analysis] {file_summary}
124
 
125
+ Please provide detailed analysis from these perspectives:
126
+ 1. πŸ“‹ Overall file structure and format
127
+ 2. πŸ“Š Key content and component analysis
128
+ 3. πŸ“ˆ Data/content characteristics and patterns
129
+ 4. ⭐ Quality and completeness evaluation
130
+ 5. πŸ’‘ Suggested improvements
131
+ 6. 🎯 Practical applications and recommendations"""
132
 
133
  messages = [{"role": "system", "content": f"{system_prefix} {system_message}"}]
134
  messages.extend(format_history(history))
 
151
  yield "", history + [[message, partial_message]]
152
 
153
  except Exception as e:
154
+ error_msg = f"❌ Inference error: {str(e)}"
155
  yield "", history + [[message, error_msg]]
156
 
157
  css = """
158
  footer {visibility: hidden}
159
  """
160
 
161
+ with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css, title="EveryChat πŸ€–") as demo:
162
+ gr.HTML(
163
+ """
164
+ <div style="text-align: center; max-width: 800px; margin: 0 auto;">
165
+ <h1 style="font-size: 3em; font-weight: 600; margin: 0.5em;">EveryChat πŸ€–</h1>
166
+ <h3 style="font-size: 1.2em; margin: 1em;">Your Intelligent File Analysis Assistant πŸ“Š</h3>
167
+ </div>
168
+ """
169
+ )
170
+
171
  with gr.Row():
172
  with gr.Column(scale=2):
173
+ chatbot = gr.Chatbot(height=600, label="Chat Interface πŸ’¬")
174
  msg = gr.Textbox(
175
+ label="Type your message",
176
  show_label=False,
177
+ placeholder="Ask me anything about the uploaded file... πŸ’­",
178
  container=False
179
  )
180
+ clear = gr.ClearButton([msg, chatbot], label="Clear Chat πŸ—‘οΈ")
181
 
182
  with gr.Column(scale=1):
183
  model_name = gr.Radio(
184
  choices=list(LLM_MODELS.keys()),
185
+ value="Cohere c4ai-crp-08-2024",
186
+ label="Select LLM Model πŸ€–",
187
+ info="Choose your preferred AI model"
188
  )
189
 
190
  file_upload = gr.File(
191
+ label="Upload File πŸ“",
192
+ info="Support: Text, Code, CSV, Parquet files",
193
  file_types=["text", ".csv", ".parquet"],
194
  type="filepath"
195
  )
196
 
197
+ with gr.Accordion("Advanced Settings βš™οΈ", open=False):
198
+ system_message = gr.Textbox(label="System Message πŸ“", value="")
199
+ max_tokens = gr.Slider(minimum=1, maximum=8000, value=4000, label="Max Tokens πŸ“Š")
200
+ temperature = gr.Slider(minimum=0, maximum=1, value=0.7, label="Temperature 🌑️")
201
+ top_p = gr.Slider(minimum=0, maximum=1, value=0.9, label="Top P πŸ“ˆ")
202
 
203
+ # Event bindings
204
  msg.submit(
205
  chat,
206
  inputs=[msg, chatbot, file_upload, model_name, system_message, max_tokens, temperature, top_p],
 
212
  [msg]
213
  )
214
 
215
+ # Auto-analysis on file upload
216
  file_upload.change(
217
  chat,
218
+ inputs=[gr.Textbox(value="Starting file analysis..."), chatbot, file_upload, model_name, system_message, max_tokens, temperature, top_p],
219
  outputs=[msg, chatbot],
220
  queue=True
221
  )
222
 
223
+ # Example queries
224
  gr.Examples(
225
  examples=[
226
+ ["Please explain the overall structure and features of the file in detail πŸ“‹"],
227
+ ["Analyze the main patterns and characteristics of this file πŸ“Š"],
228
+ ["Evaluate the file's quality and potential improvements πŸ’‘"],
229
+ ["How can we practically utilize this file? 🎯"],
230
+ ["Summarize the main content and derive key insights ✨"],
231
+ ["Please continue with more detailed analysis πŸ“ˆ"],
232
  ],
233
  inputs=msg,
234
  )
235
 
236
  if __name__ == "__main__":
237
+ demo.launch()