import asyncio
import json
from typing import Dict, List
from urllib.parse import urlencode

import gradio as gr
import jmespath
import nest_asyncio
import torch
import torch.nn as nn
from httpx import AsyncClient, Response
from loguru import logger as log
from transformers import AutoModel, AutoTokenizer

# Allow re-entrant event loops (useful when this script runs inside a notebook).
nest_asyncio.apply()

# Shared HTTP/2 client with browser-like headers so the TikTok API accepts the requests.
client = AsyncClient(
    http2=True,
    headers={
        "Accept-Language": "en-US,en;q=0.9",
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "content-type": "application/json",
    },
)


def parse_comments(response: Response) -> Dict:
    """Parse a TikTok comment-list API response into comment texts and a total count."""
    try:
        data = json.loads(response.text)
    except json.JSONDecodeError:
        log.error(f"Failed to parse JSON response: {response.text}")
        return {"comments": [], "total_comments": 0}

    comments_data = data.get("comments", [])
    total_comments = data.get("total", 0)

    if not comments_data:
        log.warning(f"No comments found in response: {response.text}")
        return {"comments": [], "total_comments": total_comments}

    parsed_comments = []
    for comment in comments_data:
        # Keep only the comment text from each raw comment object.
        result = jmespath.search(
            """{
                text: text
            }""",
            comment,
        )
        parsed_comments.append(result)

    return {"comments": parsed_comments, "total_comments": total_comments}


async def scrape_comments(post_id: int, comments_count: int = 20, max_comments: int = None) -> List[Dict]:
    """Scrape comments from a TikTok post, paginating until max_comments is reached."""

    def form_api_url(cursor: int):
        base_url = "https://www.tiktok.com/api/comment/list/?"
        params = {
            "aweme_id": post_id,
            "count": comments_count,
            "cursor": cursor,  # the index to start from
        }
        return base_url + urlencode(params)

    log.info(f"Scraping comments from post ID: {post_id}")
    first_page = await client.get(form_api_url(0))
    data = parse_comments(first_page)
    comments_data = data["comments"]
    total_comments = data["total_comments"]

    if not comments_data:
        log.warning(f"No comments found for post ID {post_id}")
        return []

    if max_comments and max_comments < total_comments:
        total_comments = max_comments

    log.info(f"Scraping comments pagination, remaining {total_comments // comments_count - 1} more pages")
    _other_pages = [
        client.get(form_api_url(cursor=cursor))
        for cursor in range(comments_count, total_comments + comments_count, comments_count)
    ]
    for response in asyncio.as_completed(_other_pages):
        response = await response
        new_comments = parse_comments(response)["comments"]
        comments_data.extend(new_comments)
        # Stop once we have reached or exceeded the maximum number of comments to scrape.
        if max_comments and len(comments_data) >= max_comments:
            comments_data = comments_data[:max_comments]
            break

    log.success(f"Scraped {len(comments_data)} comments from post ID {post_id}")
    return comments_data


class SentimentClassifier(nn.Module):
    """PhoBERT encoder with a dropout + linear head for 3-way comment classification."""

    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained("vinai/phobert-base")
        self.drop = nn.Dropout(p=0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)
        nn.init.normal_(self.fc.weight, std=0.02)
        nn.init.normal_(self.fc.bias, 0)

    def forward(self, input_ids, attention_mask):
        last_hidden_state, output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,  # return a plain tuple; dropout expects a tensor, not a ModelOutput
        )
        x = self.drop(output)
        x = self.fc(x)
        return x


def infer(text, tokenizer, max_len=120):
    """Classify a single comment and return its predicted label name."""
    encoded_review = tokenizer.encode_plus(
        text,
        max_length=max_len,
        truncation=True,
        add_special_tokens=True,
        padding="max_length",
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors="pt",
    )

    input_ids = encoded_review["input_ids"].to(device)
    attention_mask = encoded_review["attention_mask"].to(device)

    with torch.no_grad():
        output = model(input_ids, attention_mask)
    _, y_pred = torch.max(output, dim=1)

    return class_names[y_pred]


async def predict_comments(video_id):
    """Scrape up to 2000 comments from a post and report per-label percentages."""
    comments = await scrape_comments(
        post_id=int(video_id),
        max_comments=2000,
        comments_count=20,
    )

    predictions = []
    for comment in comments:
        text = comment["text"]
        label = infer(text, tokenizer)
        predictions.append({"comment": text, "predictions": label})

    # Calculate the percentage of each label.
    total_comments = len(predictions)
    if total_comments == 0:
        # Avoid division by zero when no comments could be scraped.
        return {"total_comments": 0, "label_percentages": {}}

    label_counts = [0, 0, 0]  # CLEAN, OFFENSIVE, HATE
    comment_off = []
    comment_hate = []
    for prediction in predictions:
        label = prediction["predictions"]
        if label == "CLEAN":
            label_counts[0] += 1
        elif label == "OFFENSIVE":
            label_counts[1] += 1
            comment_off.append(prediction["comment"])
        else:
            label_counts[2] += 1
            comment_hate.append(prediction["comment"])

    label_percentages = [count / total_comments * 100 for count in label_counts]

    results = {
        "total_comments": total_comments,
        "label_percentages": {
            "CLEAN": label_percentages[0],
            "OFFENSIVE": label_percentages[1],
            "HATE": label_percentages[2],
            "CMT OFFENSIVE": comment_off,
            "CMT HATE": comment_hate,
        },
    }
    return results


# Load the fine-tuned PhoBERT checkpoint and the matching tokenizer.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = SentimentClassifier(n_classes=3)
model.to(device)
model.load_state_dict(torch.load("phobert_fold1.pth", map_location=device))
model.eval()  # disable dropout for deterministic inference
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
class_names = ["CLEAN", "OFFENSIVE", "HATE"]

iface = gr.Interface(
    fn=predict_comments,
    inputs="text",
    outputs="json",
)
iface.launch()