import gradio as gr
import pandas as pd
import re
from collections import Counter
import os

def process_excel(file) -> str:
    """Count keyword frequencies in column D of an Excel workbook.

    The workbook is read with ``pandas.read_excel`` (so the first row is
    treated as the header). Every non-empty cell in the fourth column
    (column D) has its punctuation removed and is split on whitespace. A
    keyword is counted at most once per cell, so ``Frequency`` is the number
    of cells containing that keyword, not the raw number of occurrences.

    The result is written to ``output/keyword_counts.xlsx`` with the columns
    ``Keyword`` and ``Frequency``, ordered by descending frequency and, for
    equal frequencies, alphabetically by keyword so the output is
    reproducible between runs.

    NOTE(review): the output path is fixed, so two calls overwrite each
    other's result file.

    Args:
        file: The uploaded workbook, given either as a filesystem path
            (``str`` / ``os.PathLike``) or as an object exposing that path
            through a ``name`` attribute. Both forms are accepted because the
            upload component may hand over either one depending on the
            Gradio version and its ``type`` setting.

    Returns:
        Relative path of the generated workbook.

    Raises:
        ValueError: If no file was supplied, or if the sheet has fewer than
            four columns and therefore no column D.
    """
    if file is None:
        raise ValueError("No Excel file was provided.")

    # Accept a plain path as well as a file-like wrapper carrying ``.name``.
    path = file if isinstance(file, (str, os.PathLike)) else file.name
    df = pd.read_excel(path)

    column_count = df.shape[1]
    if column_count < 4:
        raise ValueError(
            f"Expected at least 4 columns to read column D, found {column_count}."
        )

    # Column D is positional index 3; blank cells are ignored.
    product_names = df.iloc[:, 3].dropna()

    punctuation = re.compile(r'[^\w\s]')
    keyword_counts = Counter()
    for name in product_names:
        # Cells may hold numbers or dates, so coerce to text before cleaning.
        words = punctuation.sub('', str(name)).split()
        # De-duplicate within a cell so each cell contributes at most 1.
        keyword_counts.update(set(words))

    # Sort explicitly: set iteration order would otherwise leak into ties.
    rows = sorted(keyword_counts.items(), key=lambda item: (-item[1], item[0]))
    result_df = pd.DataFrame(rows, columns=['Keyword', 'Frequency'])

    output_dir = "output"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    output_file = os.path.join(output_dir, "keyword_counts.xlsx")
    result_df.to_excel(output_file, index=False)

    return output_file

# Web UI wiring: one uploaded workbook goes in, the path returned by
# process_excel is offered back as a downloadable file.
iface = gr.Interface(
    title="Excel Keyword Extractor",
    description="엑셀 파일의 D열에서 키워드를 추출하고 빈도를 계산하여 새로운 엑셀 파일로 출력합니다.",
    fn=process_excel,
    # Restrict the upload widget to .xlsx workbooks.
    inputs=gr.File(file_types=[".xlsx"]),
    outputs="file",
)

# Launch the interface only when executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()