btrunghieu
committed on
Commit
•
45d9117
1
Parent(s):
75392c3
Update app.py
Browse files
app.py
CHANGED
@@ -28,12 +28,14 @@ def parse_comments(response: Response) -> Dict:
|
|
28 |
try:
|
29 |
data = json.loads(response.text)
|
30 |
except json.JSONDecodeError:
|
|
|
31 |
return {"comments": [], "total_comments": 0}
|
32 |
|
33 |
comments_data = data.get("comments", [])
|
34 |
total_comments = data.get("total", 0)
|
35 |
|
36 |
if not comments_data:
|
|
|
37 |
return {"comments": [], "total_comments": total_comments}
|
38 |
|
39 |
parsed_comments = []
|
@@ -58,16 +60,19 @@ async def scrape_comments(post_id: int, comments_count: int = 20, max_comments:
|
|
58 |
}
|
59 |
return base_url + urlencode(params)
|
60 |
|
|
|
61 |
first_page = await client.get(form_api_url(0))
|
62 |
data = parse_comments(first_page)
|
63 |
comments_data = data["comments"]
|
64 |
total_comments = data["total_comments"]
|
65 |
|
66 |
if not comments_data:
|
|
|
67 |
return []
|
68 |
if max_comments and max_comments < total_comments:
|
69 |
total_comments = max_comments
|
70 |
|
|
|
71 |
_other_pages = [
|
72 |
client.get(form_api_url(cursor=cursor))
|
73 |
for cursor in range(comments_count, total_comments + comments_count, comments_count)
|
@@ -82,6 +87,8 @@ async def scrape_comments(post_id: int, comments_count: int = 20, max_comments:
|
|
82 |
if max_comments and len(comments_data) >= max_comments:
|
83 |
comments_data = comments_data[:max_comments]
|
84 |
break
|
|
|
|
|
85 |
return comments_data
|
86 |
|
87 |
class SentimentClassifier(nn.Module):
|
@@ -172,8 +179,6 @@ model = SentimentClassifier(n_classes=3)
|
|
172 |
model.to(device)
|
173 |
model.load_state_dict(torch.load('phobert_fold1.pth', map_location=torch.device('cpu')))
|
174 |
|
175 |
-
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
|
176 |
-
|
177 |
class_names = ['CLEAN', 'OFFENSIVE', 'HATE']
|
178 |
|
179 |
|
|
|
28 |
try:
|
29 |
data = json.loads(response.text)
|
30 |
except json.JSONDecodeError:
|
31 |
+
log.error(f"Failed to parse JSON response: {response.text}")
|
32 |
return {"comments": [], "total_comments": 0}
|
33 |
|
34 |
comments_data = data.get("comments", [])
|
35 |
total_comments = data.get("total", 0)
|
36 |
|
37 |
if not comments_data:
|
38 |
+
log.warning(f"No comments found in response: {response.text}")
|
39 |
return {"comments": [], "total_comments": total_comments}
|
40 |
|
41 |
parsed_comments = []
|
|
|
60 |
}
|
61 |
return base_url + urlencode(params)
|
62 |
|
63 |
+
log.info(f"Scraping comments from post ID: {post_id}")
|
64 |
first_page = await client.get(form_api_url(0))
|
65 |
data = parse_comments(first_page)
|
66 |
comments_data = data["comments"]
|
67 |
total_comments = data["total_comments"]
|
68 |
|
69 |
if not comments_data:
|
70 |
+
log.warning(f"No comments found for post ID {post_id}")
|
71 |
return []
|
72 |
if max_comments and max_comments < total_comments:
|
73 |
total_comments = max_comments
|
74 |
|
75 |
+
log.info(f"Scraping comments pagination, remaining {total_comments // comments_count - 1} more pages")
|
76 |
_other_pages = [
|
77 |
client.get(form_api_url(cursor=cursor))
|
78 |
for cursor in range(comments_count, total_comments + comments_count, comments_count)
|
|
|
87 |
if max_comments and len(comments_data) >= max_comments:
|
88 |
comments_data = comments_data[:max_comments]
|
89 |
break
|
90 |
+
|
91 |
+
log.success(f"Scraped {len(comments_data)} comments from post ID {post_id}")
|
92 |
return comments_data
|
93 |
|
94 |
class SentimentClassifier(nn.Module):
|
|
|
179 |
model.to(device)
|
180 |
model.load_state_dict(torch.load('phobert_fold1.pth', map_location=torch.device('cpu')))
|
181 |
|
|
|
|
|
182 |
class_names = ['CLEAN', 'OFFENSIVE', 'HATE']
|
183 |
|
184 |
|