import asyncio
import json
from typing import Dict, List
from urllib.parse import urlencode

import gradio as gr
import jmespath
import nest_asyncio
import torch
import torch.nn as nn
from httpx import AsyncClient, Response
from loguru import logger as log
from transformers import AutoModel, AutoTokenizer

# Allow re-entrant event loops (useful when this script runs inside a notebook).
nest_asyncio.apply()

# Shared HTTP/2 client with browser-like headers so the TikTok API accepts the requests.
client = AsyncClient(
    http2=True,
    headers={
        "Accept-Language": "en-US,en;q=0.9",
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "content-type": "application/json",
    },
)


def parse_comments(response: Response) -> Dict:
    """Parse a TikTok comment-list API response into comment texts and a total count."""
    try:
        data = json.loads(response.text)
    except json.JSONDecodeError:
        log.error(f"Failed to parse JSON response: {response.text}")
        return {"comments": [], "total_comments": 0}

    comments_data = data.get("comments", [])
    total_comments = data.get("total", 0)

    if not comments_data:
        log.warning(f"No comments found in response: {response.text}")
        return {"comments": [], "total_comments": total_comments}

    parsed_comments = []
    for comment in comments_data:
        # Keep only the comment text from each raw comment object.
        result = jmespath.search(
            """{
                text: text
            }""",
            comment,
        )
        parsed_comments.append(result)

    return {"comments": parsed_comments, "total_comments": total_comments}


async def scrape_comments(post_id: int, comments_count: int = 20, max_comments: int = None) -> List[Dict]:
    """Scrape comments from a TikTok post, paginating until max_comments is reached."""

    def form_api_url(cursor: int):
        base_url = "https://www.tiktok.com/api/comment/list/?"
        params = {
            "aweme_id": post_id,
            "count": comments_count,
            "cursor": cursor,  # the index to start from
        }
        return base_url + urlencode(params)

    log.info(f"Scraping comments from post ID: {post_id}")
    first_page = await client.get(form_api_url(0))
    data = parse_comments(first_page)
    comments_data = data["comments"]
    total_comments = data["total_comments"]

    if not comments_data:
        log.warning(f"No comments found for post ID {post_id}")
        return []

    if max_comments and max_comments < total_comments:
        total_comments = max_comments

    log.info(f"Scraping comments pagination, remaining {total_comments // comments_count - 1} more pages")
    _other_pages = [
        client.get(form_api_url(cursor=cursor))
        for cursor in range(comments_count, total_comments + comments_count, comments_count)
    ]
    for response in asyncio.as_completed(_other_pages):
        response = await response
        new_comments = parse_comments(response)["comments"]
        comments_data.extend(new_comments)
        # Stop once we have reached or exceeded the maximum number of comments to scrape.
        if max_comments and len(comments_data) >= max_comments:
            comments_data = comments_data[:max_comments]
            break

    log.success(f"Scraped {len(comments_data)} comments from post ID {post_id}")
    return comments_data


class SentimentClassifier(nn.Module):
    """PhoBERT encoder with a dropout + linear head for 3-way comment classification."""

    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained("vinai/phobert-base")
        self.drop = nn.Dropout(p=0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)
        nn.init.normal_(self.fc.weight, std=0.02)
        nn.init.normal_(self.fc.bias, 0)

    def forward(self, input_ids, attention_mask):
        last_hidden_state, output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,  # return a plain tuple; dropout expects a tensor, not a ModelOutput
        )
        x = self.drop(output)
        x = self.fc(x)
        return x


def infer(text, tokenizer, max_len=120):
    """Classify a single comment and return its predicted label name."""
    encoded_review = tokenizer.encode_plus(
        text,
        max_length=max_len,
        truncation=True,
        add_special_tokens=True,
        padding="max_length",
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors="pt",
    )

    input_ids = encoded_review["input_ids"].to(device)
    attention_mask = encoded_review["attention_mask"].to(device)

    with torch.no_grad():
        output = model(input_ids, attention_mask)
    _, y_pred = torch.max(output, dim=1)

    return class_names[y_pred]


async def predict_comments(video_id):
    """Scrape up to 2000 comments from a post and report per-label percentages."""
    comments = await scrape_comments(
        post_id=int(video_id),
        max_comments=2000,
        comments_count=20,
    )

    predictions = []
    for comment in comments:
        text = comment["text"]
        label = infer(text, tokenizer)
        predictions.append({"comment": text, "predictions": label})

    # Calculate the percentage of each label.
    total_comments = len(predictions)
    if total_comments == 0:
        # Avoid division by zero when no comments could be scraped.
        return {"total_comments": 0, "label_percentages": {}}

    label_counts = [0, 0, 0]  # CLEAN, OFFENSIVE, HATE
    comment_off = []
    comment_hate = []
    for prediction in predictions:
        label = prediction["predictions"]
        if label == "CLEAN":
            label_counts[0] += 1
        elif label == "OFFENSIVE":
            label_counts[1] += 1
            comment_off.append(prediction["comment"])
        else:
            label_counts[2] += 1
            comment_hate.append(prediction["comment"])

    label_percentages = [count / total_comments * 100 for count in label_counts]

    results = {
        "total_comments": total_comments,
        "label_percentages": {
            "CLEAN": label_percentages[0],
            "OFFENSIVE": label_percentages[1],
            "HATE": label_percentages[2],
            "CMT OFFENSIVE": comment_off,
            "CMT HATE": comment_hate,
        },
    }
    return results


# Load the fine-tuned PhoBERT checkpoint and the matching tokenizer.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = SentimentClassifier(n_classes=3)
model.to(device)
model.load_state_dict(torch.load("phobert_fold1.pth", map_location=device))
model.eval()  # disable dropout for deterministic inference
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
class_names = ["CLEAN", "OFFENSIVE", "HATE"]

iface = gr.Interface(
    fn=predict_comments,
    inputs="text",
    outputs="json",
)
iface.launch()