cuongnguyen910 committed
Commit 5120311
Parent(s): 411c450
Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
- .dockerignore +3 -0
- .gitignore +21 -0
- Dockerfile +46 -0
- Dockerfile_gpu +33 -0
- Dockerfile_gpu_Thien +34 -0
- Jenkinsfile +22 -0
- README.md +55 -12
- config/__init__.py +1 -0
- config/cfg.yaml +173 -0
- config/config.py +57 -0
- config/config.yml +38 -0
- consumer_clustering.py +181 -0
- consumer_clustering_mnews.py +137 -0
- consumer_hot_topic_ondemand.py +209 -0
- consumer_merge_clustering.py +96 -0
- consumer_merge_clustering_newscms.py +109 -0
- deployment.yaml +28 -0
- docker/api_trt/Dockerfile +22 -0
- docker/cpu/Dockerfile +46 -0
- docker/gpu/Dockerfile_gpu +33 -0
- docker_build.txt +6 -0
- entity/__init__.py +1 -0
- entity/__pycache__/__init__.cpython-37.pyc +0 -0
- entity/__pycache__/__init__.cpython-38.pyc +0 -0
- entity/__pycache__/types.cpython-37.pyc +0 -0
- entity/__pycache__/types.cpython-38.pyc +0 -0
- entity/types.py +0 -0
- function/SessionProcess.py +55 -0
- function/__init__.py +3 -0
- function/__pycache__/SessionProcess.cpython-37.pyc +0 -0
- function/__pycache__/SessionProcess.cpython-38.pyc +0 -0
- function/__pycache__/__init__.cpython-37.pyc +0 -0
- function/__pycache__/__init__.cpython-38.pyc +0 -0
- function/__pycache__/tc_v2.cpython-37.pyc +0 -0
- function/__pycache__/tc_v2.cpython-38.pyc +0 -0
- function/clean_text.py +84 -0
- function/detect_time.py +92 -0
- function/embed_vncorenlp.py +161 -0
- function/sentence_embbeding.py +41 -0
- function/summary_with_llm.py +210 -0
- function/tc_v2.py +573 -0
- function/topic_clustering.py +458 -0
- function/topic_clustering_mnews.py +339 -0
- function/topic_clustering_not_summary.py +463 -0
- function/topic_clustering_social.py +156 -0
- function/topic_clustering_v2.py +390 -0
- function/translate.py +37 -0
- function/utils.py +94 -0
- get_config.py +5 -0
- iclibs/ic_rabbit.py +126 -0
.dockerignore
ADDED
@@ -0,0 +1,3 @@
*venv*
log
log_run
.gitignore
ADDED
@@ -0,0 +1,21 @@
venv/
data/
vncore/
global/
cls/
check.json
test3.py
test2.py
time_test.py
vncorenlp
*venv*
__pycache__/
req_daily
log*
sample
model
*.json
*test*
docker_venv
core
models
Dockerfile
ADDED
@@ -0,0 +1,46 @@
FROM python:3.7

WORKDIR /app


RUN apt-get update && apt-get install build-essential cmake git -y

#----------------JRE (for vncorenlp)--------------
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive \
    apt-get -y install default-jre-headless && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

RUN apt-get install unzip
RUN pip install gdown
RUN gdown --id 1MTAPYy9AcYtfiJ6m_pz6MPeA6li8pYp7
RUN unzip vncorenlp.zip -d /app/


# COPY ./model /app/model

RUN mkdir -p /app/model
RUN mkdir -p /app/log
RUN mkdir -p /app/log_run

COPY reqs_cpu.txt /app/
RUN pip install -r reqs_cpu.txt

COPY ./load_model.py /app/
RUN python load_model.py

COPY ./config /app/config
COPY ./entity /app/entity
COPY ./function /app/function
COPY ./main_cache.py /app/
COPY ./service_cache.py /app/
COPY ./summary.py /app/
COPY ./merge_topic.py /app/
COPY ./consumer_clustering.py /app/
COPY ./consumer_merge_clustering.py /app/
COPY ./run_multi_process.py /app/

RUN rm -r ~/.cache/pip/*

CMD ["python", "run_multi_process.py"]
Dockerfile_gpu
ADDED
@@ -0,0 +1,33 @@
FROM python:3.7

WORKDIR /app


RUN apt-get update && apt-get install build-essential cmake git -y

#----------------JRE (for vncorenlp)--------------
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive \
    apt-get -y install default-jre-headless && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

COPY ./model /app/model

RUN pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html
RUN mkdir log
RUN mkdir log_run

COPY req.txt /app/
RUN pip install -r req.txt

COPY ./entity /app/entity
COPY ./function /app/function
COPY ./vncorenlp /app/vncorenlp
COPY ./main_cache.py /app/
COPY ./service_cache.py /app/
COPY ./summary.py /app/

RUN rm -r ~/.cache/pip/*

CMD ["python", "main_cache.py"]
Dockerfile_gpu_Thien
ADDED
@@ -0,0 +1,34 @@
FROM python:3.7.17-slim-bullseye

WORKDIR /app


RUN apt-get update && apt-get install build-essential cmake git -y

#----------------JRE (for vncorenlp)--------------
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive \
    apt-get -y install default-jre-headless && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

COPY ./model /app/model
COPY ./tensorRT /app/tensorRT

RUN pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html
RUN mkdir log
RUN mkdir log_run

COPY req.txt /app/
RUN pip install -r req.txt

COPY ./entity /app/entity
COPY ./function /app/function
COPY ./vncorenlp /app/vncorenlp
COPY ./main_cache_Thien.py /app/
COPY ./service_cache_Thien.py /app/
COPY ./summary.py /app/

RUN rm -r ~/.cache/pip/*

CMD ["python", "main_cache_Thien.py"]
Jenkinsfile
ADDED
@@ -0,0 +1,22 @@
node {
    checkout scm

    env.DOCKER_API_VERSION="1.23"
    registry_host = env.registry_host
    appName = "clusteringcpu"

    sh "git rev-parse --short HEAD > commit-id"
    tag = readFile('commit-id').replace("\n", "").replace("\r", "")

    stage("build"){
        sh "docker build --tag ${registry_host}/${appName}:${tag} --file Dockerfile ."
    }
    stage("push"){
        sh "docker push ${registry_host}/${appName}:${tag}"
    }
    stage("deploy"){
        sh "sed -i s/{{tag}}/${tag}/g deployment.yaml \
            && sed -i 's|{{registry_host}}|${registry_host}|g' deployment.yaml \
            && kubectl ${env.token_kube} apply -f deployment.yaml"
    }
}
README.md
CHANGED
@@ -1,12 +1,55 @@
Removed: the previous 12-line front matter, which contained only "---", an empty "title:", and blank lines. Added:
---
title: topic-clustering-global-dashboard
app_file: service_periodic.py
sdk: gradio
sdk_version: 4.43.0
---
## Build docker
```
docker build -t clustering-dashboard .
docker run -d --restart=always --name clustering-dashboard clustering-dashboard
```

## Deploy TRT
```
scp -r docker/api_trt/Dockerfile ./
docker build -t api-clustering .
docker run --gpus 1 --dns 8.8.8.8 -it -p 8633:8633 -v /home/dp04/topic-clustering-global-dashboard:/app -t --name api-clustering api-clustering
```

### Topic Clustering
1. Clustering API on port 8633: used to cluster NEWS for the report types (quick report, overview report, template report, ...)
   - Command:
   ```
   docker run --gpus all -it --rm -v /home/vietle/topic-clustering:/home/vietle/topic-clustering --name topic_clustering_trt_sb -p 8636:8633 topic-clustering-trt
   cd /home/vietle/topic-clustering && CUDA_VISIBLE_DEVICES=<cuda_device_id> python main_cache.py
   ```
   - screen: 52097.clustering.trt
2. Social clustering API: clusters social news posts
   - Command: cd /home/vietle/topic-clustering/ && source docker_venv/bin/activate && uvicorn service_social:app --host 0.0.0.0 --port 8635
   - Screen: 37529.clustering.social
3. Clustering for the global dashboard:
   - API:
   ```
   CMD: systemctl status bzo_clustering_api_cpu.service (port 8634)
   Screen: 21445.clustering.cpu
   ```
   - Consumer:
     + Daily clustering:
     ```
     Command: cd /home/vietle/topic-clustering/ && source docker_venv/bin/activate && OMP_NUM_THREADS=16 CUDA_VISIBLE_DEVICES=-1 python consumer_clustering.py
     Screen: 16024.bzo.clustering_daily
     ```
     + Weekly/monthly clustering:
     ```
     Command: cd /home/vietle/topic-clustering/ && source docker_venv/bin/activate && OMP_NUM_THREADS=16 CUDA_VISIBLE_DEVICES='-1' python consumer_merge_clustering.py
     Screen: 60767.bzo.clustering_monthly
     ```

4. News briefing ("điểm tin") clustering:
   - Command:
   ```
   docker run --gpus all -it --rm -v /home/vietle/topic-clustering:/home/vietle/topic-clustering --name topic_clustering_trt_sb -p 8636:8633 topic-clustering-trt
   cd /home/vietle/topic-clustering/ && python main_cache_Thien.py
   ```
config/__init__.py
ADDED
@@ -0,0 +1 @@
from .config import get_config, parse_connection_string
config/cfg.yaml
ADDED
@@ -0,0 +1,173 @@
AppSettings:
  ConnectionStrings:
    facebook_info: Server=10.9.3.141,1433; Database=facebook_info; User Id='facebook_info_read'; Password='dh2uxJny'; Pooling=True; Connect Timeout=45; MultipleActiveResultSets=True; Encrypt=False
  GraylogConfig:
ConfigManager:
  ChannelConfigs:
  MemcachedConfigs:
  QueueConfigs:
    queue_topic_clustering:
      Arguments: {}
      AutoAck: false
      AutoDelete: false
      AutomaticRecoveryEnabled: true
      ConsumerMaxRetry: 0
      Durable: true
      ErrorCounter: 100
      ErrorCounterTotalSeconds: 60
      Exchange: ''
      ExchangeArguments:
      ExchangeAutoDelete: false
      ExchangeDurable: true
      ExchangeType:
      Exclusive: false
      FailedQueue:
      HostName: 10.9.3.251
      HostNames:
      - 10.9.3.251
      Servers:
      - Host: 10.9.3.251
        Port: 15672
      Key: queue1
      MaxWorkpool: 1
      NetworkRecoveryInterval: 120
      Password: 1
      PrefetchCount: 200
      Queue: topic-clustering
      QueueBindArguments:
      QueueBindRoutingKey:
      RequestedHeartbeat: 120
      TopologyRecoveryEnabled: false
      UserName: long.nguyen
      VirtualHost: posts-broadcast
      MessageDeliveryMode:
      IsProducer: true
      IsConsumer: true
    queue_merge_clustering:
      Arguments: {}
      AutoAck: false
      AutoDelete: false
      AutomaticRecoveryEnabled: true
      ConsumerMaxRetry: 0
      Durable: true
      ErrorCounter: 100
      ErrorCounterTotalSeconds: 60
      Exchange: ''
      ExchangeArguments:
      ExchangeAutoDelete: false
      ExchangeDurable: true
      ExchangeType:
      Exclusive: false
      FailedQueue:
      HostName: 10.9.3.251
      HostNames:
      - 10.9.3.251
      Servers:
      - Host: 10.9.3.251
        Port: 15672
      Key: queue2
      MaxWorkpool: 1
      NetworkRecoveryInterval: 120
      Password: 1
      PrefetchCount: 200
      Queue: merge-clustering
      QueueBindArguments:
      QueueBindRoutingKey:
      RequestedHeartbeat: 120
      TopologyRecoveryEnabled: false
      UserName: long.nguyen
      VirtualHost: posts-broadcast
      MessageDeliveryMode:
      IsProducer: true
      IsConsumer: true
    queue_topic_clustering_mnews:
      Arguments: {}
      AutoAck: false
      AutoDelete: false
      AutomaticRecoveryEnabled: true
      ConsumerMaxRetry: 0
      Durable: true
      ErrorCounter: 100
      ErrorCounterTotalSeconds: 60
      Exchange: ''
      ExchangeArguments:
      ExchangeAutoDelete: false
      ExchangeDurable: true
      ExchangeType:
      Exclusive: false
      FailedQueue:
      HostName: 10.9.3.251
      HostNames:
      - 10.9.3.251
      Servers:
      - Host: 10.9.3.251
        Port: 15672
      Key: queue1
      MaxWorkpool: 1
      NetworkRecoveryInterval: 120
      Password: 1
      PrefetchCount: 200
      Queue: topic-clustering-mnews
      QueueBindArguments:
      QueueBindRoutingKey:
      RequestedHeartbeat: 120
      TopologyRecoveryEnabled: false
      UserName: long.nguyen
      VirtualHost: posts-broadcast
      MessageDeliveryMode:
      IsProducer: true
      IsConsumer: true
    queue_merge_clustering_newscms:
      Arguments: {}
      AutoAck: false
      AutoDelete: false
      AutomaticRecoveryEnabled: true
      ConsumerMaxRetry: 0
      Durable: true
      ErrorCounter: 100
      ErrorCounterTotalSeconds: 60
      Exchange: ''
      ExchangeArguments:
      ExchangeAutoDelete: false
      ExchangeDurable: true
      ExchangeType: fanout
      Exclusive: false
      FailedQueue:
      HostName: 10.9.3.251
      HostNames:
      - 10.9.3.251
      Servers:
      - Host: 10.9.3.251
        Port: 5672
      Key: queue_merge_clustering_newscms
      MaxWorkpool: 1
      NetworkRecoveryInterval: 120
      Password: 1
      PrefetchCount: 200
      Queue: newscms-merge-clustering
      QueueBindArguments:
      QueueBindRoutingKey:
      RequestedHeartbeat: 120
      TopologyRecoveryEnabled: false
      UserName: long.nguyen
      VirtualHost: news-cms
      MessageDeliveryMode:
      IsProducer: true
      IsConsumer: true
  AppSettings:
  ApiConnects:
    api_save_clustering:
      BaseUrl: https://staging.pontusinc.com/api/category_management/v1/topic-clustering-dashboard/insert
      IsMonitor: true
      MonitorUrl: https://staging.pontusinc.com/api/category_management/v1/topic-clustering-dashboard/insert
    api_sbert:
      BaseUrl: http://10.9.3.240:6789/sbert/encode_list
      IsMonitor: true
      MonitorUrl: http://10.9.3.240:6789/sbert/encode_list

    api_save_cluster_newscms:
      BaseUrl: https://staging.pontusinc.com/api/news_cms/News/update_cluster_result
      IsMonitor: true
      MonitorUrl: https://staging.pontusinc.com/api/news_cms/News/update_cluster_result
  GraylogConfig:
    key_test: test
config/config.py
ADDED
@@ -0,0 +1,57 @@
import requests
import json
import os
from yaml import load
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper


URL_CFG = "http://icomm-api-configserver/api/configserver/v1/configuration.yaml"
# http://10.9.2.151:31244/api/configserver/v1/configuration.yaml
# configure the hosts file in C:\Windows\System32\drivers\etc as follows:
# 123.31.42.17 icomm-api-configserver

AccessToken = "wbecrEfJk8F36y0WojqBQaqT28d6NaBnCLBgkoO2sCg3aNhYACkSxMNvWwlsAj5k"
Environment = "Production"
path_save_cfg = "config/cfg.yaml"


def get_config():
    cfg = None
    try:
        payload = json.dumps({
            "AccessToken": AccessToken,
            "Environment": Environment
        })
        headers = {
            'accept': 'text/plain',
            'Content-Type': 'application/json-patch+json'
        }

        response = requests.request("POST", URL_CFG, headers=headers, data=payload)
        # if response.status_code == 200:
        #     with open(path_save_cfg, "w+") as f:
        #         f.write(response.text)
    except Exception as ve:
        print(ve)
    if os.path.exists(path_save_cfg):
        with open(path_save_cfg) as f:
            cfg = load(f, Loader)
    return cfg


def parse_connection_string(str_cnn):
    res = dict()
    split_dt = str_cnn.split(";")
    for c_sp in split_dt:
        k, v = c_sp.split("=")
        res[k.strip()] = v.replace("'", "").replace('"', '')
    return res


if __name__ == '__main__':
    cf = get_config()
    print(cf)
    print(parse_connection_string(cf["ConfigManager"]["ConnectionStrings"]["facebook_info"]["Value"]))
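As a side note (not part of the commit), here is a minimal usage sketch of `parse_connection_string`, applied to the `facebook_info` connection string defined in `config/cfg.yaml`. It assumes the `config` package above is importable from the project root; the printed values in the comments follow directly from the split-on-";"-and-"=" logic.

```python
# Hypothetical usage sketch: how parse_connection_string breaks a
# "key=value; key=value; ..." connection string into a dict.
from config import parse_connection_string

cnn = ("Server=10.9.3.141,1433; Database=facebook_info; "
       "User Id='facebook_info_read'; Password='dh2uxJny'; Pooling=True; "
       "Connect Timeout=45; MultipleActiveResultSets=True; Encrypt=False")

parts = parse_connection_string(cnn)
# Each "key=value" segment becomes one dict entry; quotes are stripped from values.
print(parts["Server"])     # 10.9.3.141,1433
print(parts["Database"])   # facebook_info
print(parts["User Id"])    # facebook_info_read
```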
config/config.yml
ADDED
@@ -0,0 +1,38 @@
queue_topic_clustering:
  host:
    10.9.3.251
  virtual_host:
    posts-broadcast
  queue_name:
    topic-clustering
  usr_name:
    long.nguyen
  password:
    1

queue_topic_clustering_mnews:
  host:
    10.9.3.251
  virtual_host:
    posts-broadcast
  queue_name:
    topic-clustering-mnews
  usr_name:
    long.nguyen
  password:
    1

queue_merge_clustering:
  host:
    10.9.3.251
  virtual_host:
    posts-broadcast
  queue_name:
    merge-clustering
  usr_name:
    long.nguyen
  password:
    1

api_save_clustering:
  https://staging.pontusinc.com/api/category_management/v1/topic-clustering-dashboard/insert
consumer_clustering.py
ADDED
@@ -0,0 +1,181 @@
import pika
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import json
import time
# from get_config import config_params
from config import get_config
from function import topic_clustering_not_summary as tc
from function import topic_clustering_social
import requests

config_params = get_config()
ConfigManager = config_params['ConfigManager']

last_time_check = time.time()
def update_result(result, type='daily', meta = {}):
    benchmark_children_id = -1
    benchmark_id = -1
    source_tagids = []
    for id_cluster in result:
        for doc in result[id_cluster][:1]:
            source_tagids = doc.get('source_tagids',[])
            for key in doc:
                if "benchmark_child" in key:
                    benchmark_children_id = int(key.lstrip('benchmark_child_'))
                if "benchmark" in key and 'child' not in key:
                    benchmark_id = int(key.lstrip('benchmark_'))
        break

    if not source_tagids:
        source_tagids = []

    if len(source_tagids) > 0:
        benchmark_id = 0
        benchmark_children_id = 0

    output = {
        "benchmark_id": benchmark_id,
        "benchmark_children_id": benchmark_children_id,
        "source_tagids": source_tagids,
        "country_code": meta.get('country_code',''),
        "type": type,
        "data": json.dumps(result)
    }
    # with open('test_result.json','w') as f:
    #     json.dump(output, f, ensure_ascii=False)

    # url = config_params['api_save_clustering']
    url = ConfigManager['ApiConnects']['api_save_clustering']['BaseUrl']
    # with open("/home/vietle/topic-clustering/config/save.json", 'w') as f:
    #     json.dump(output, f,ensure_ascii=False)
    res = requests.post(url, json = output)
    print(res.text)
    print('Update result !!!!!!!!!')

def callback_func(ch, method, properties, body):
    print("receive done: ")



    starttime = time.time()
    body = json.loads(body.decode("utf-8"))

    # with open('input_daily.json','w') as f:
    #     json.dump(body, f, ensure_ascii=False)

    docs = body['docs']
    # threshold = body['threshold']
    threshold = 0.25
    top_cluster = body['top_cluster']
    top_sentence = body['top_sentence']
    topn_summary = body['topn_summary']
    hash_str = body['hash_str']
    st_time = body['st_time']
    meta = body.get('meta',{})
    country_code = meta.get("country_code", "")
    delete_message = False if country_code in ["ICOMM-RND","SOCIAL"] else True

    print("country_code: ", country_code, "meta: ", meta)

    is_cache = False
    try:
        with open("log_run/log.txt") as f:
            data_dict = json.load(f)
    except Exception as ve:
        print(ve)
        data_dict = {}

    try:
        if hash_str in data_dict:
            path_res = data_dict[hash_str]["response_path"]
            with open(path_res) as ff:
                results = json.load(ff)
            print("time analysis (cache): ", time.time() - st_time)
            update_result(results,meta=meta)
            is_cache = True
    except Exception as vee:
        print(vee)

    if not is_cache:
        if country_code in ["SOCIAL"]:
            results = topic_clustering_social.topic_clustering(docs, threshold, top_cluster=top_cluster, top_sentence=top_sentence,
                                                               topn_summary=topn_summary, delete_message=delete_message)
        else:
            results = tc.topic_clustering(docs, threshold, top_cluster=top_cluster, top_sentence=top_sentence,
                                          topn_summary=topn_summary, delete_message=delete_message)
        update_result(results, meta=meta)

        path_res = "log/result_{0}.txt".format(hash_str)
        with open(path_res, "w+") as ff:
            ff.write(json.dumps(results))

        data_dict[hash_str] = {"time": st_time, "response_path": path_res}

    lst_rm = []
    global last_time_check
    if time.time() - last_time_check > 3600:
        print("check log to del .....")
        last_time_check = time.time()
        for dt in data_dict:
            if time.time() - data_dict[dt]["time"] > 30 * 24 * 3600:
                lst_rm.append(dt)
    for dt in lst_rm:
        del data_dict[dt]
    with open("log_run/log.txt", "w+") as ff:
        ff.write(json.dumps(data_dict))
    print("time analysis: ", time.time() - starttime)
    ch.basic_ack(delivery_tag=method.delivery_tag)


def test():
    with open('req_daily/aus.json') as f:
        body = json.load(f)

    docs = body['response']['docs']
    # threshold = body['threshold']
    threshold = 0.25
    top_cluster = body['top_cluster']
    top_sentence = body['top_sentence']
    topn_summary = body['topn_summary']
    # hash_str = body['hash_str']
    # st_time = body['st_time']
    meta = body['response'].get('meta',{})
    results = tc.topic_clustering(docs, threshold, top_cluster=top_cluster, top_sentence=top_sentence,
                                  topn_summary=topn_summary, delete_message=True)
    print(results)
    # update_result(results, meta=meta)
    # print(123)
if __name__ == '__main__':
    # test()
    params = ConfigManager['QueueConfigs']['queue_topic_clustering']
    usr_name = params["UserName"]
    password = str(params["Password"])
    host = params["HostName"]
    virtual_host = params["VirtualHost"]
    queue_name = params["Queue"]



    # params = config_params['queue_topic_clustering']
    # usr_name = params["usr_name"]
    # password = str(params["password"])
    # host = params["host"]
    # virtual_host = params["virtual_host"]
    # queue_name = params["queue_name"]

    while True:
        try:
            credentials = pika.PlainCredentials(usr_name, password)
            connection = pika.BlockingConnection(
                pika.ConnectionParameters(host=host, virtual_host=virtual_host, credentials=credentials, heartbeat=3600, blocked_connection_timeout=3600))
            channel = connection.channel()
            channel.queue_declare(queue=queue_name, durable=True, arguments={"x-max-priority": 10})
            print(" * wait message")
            channel.basic_qos(prefetch_count=1)
            channel.basic_consume(queue=queue_name, on_message_callback=callback_func)
            channel.start_consuming()
        except Exception as ex:
            print(f'[ERROR] ', ex)
            # raise ex
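For reference (not part of the commit), the sketch below publishes one test message shaped the way `callback_func` in `consumer_clustering.py` reads it (`docs`, `top_cluster`, `top_sentence`, `topn_summary`, `hash_str`, `st_time`, `meta`). Broker host, vhost, queue name, and credentials are taken from `config/cfg.yaml`; the `docs` list and hash value are placeholders.

```python
# Hypothetical smoke-test publisher for the topic-clustering queue.
import json
import time
import pika

credentials = pika.PlainCredentials("long.nguyen", "1")   # values from config/cfg.yaml
connection = pika.BlockingConnection(
    pika.ConnectionParameters(host="10.9.3.251", virtual_host="posts-broadcast",
                              credentials=credentials))
channel = connection.channel()
# Must match the consumer's declaration, including the priority argument.
channel.queue_declare(queue="topic-clustering", durable=True,
                      arguments={"x-max-priority": 10})

message = {
    "docs": [],                     # documents to cluster (placeholder)
    "top_cluster": 5,
    "top_sentence": -1,
    "topn_summary": 10,
    "hash_str": "example-hash",     # cache key; results land in log/result_<hash_str>.txt
    "st_time": time.time(),
    "meta": {"country_code": ""},
}
channel.basic_publish(exchange="", routing_key="topic-clustering",
                      body=json.dumps(message).encode("utf-8"))
connection.close()
```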
consumer_clustering_mnews.py
ADDED
@@ -0,0 +1,137 @@
import pika
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import json
import time
# from get_config import config_params
from config import get_config
from function import topic_clustering_mnews as tc
import requests

config_params = get_config()
ConfigManager = config_params['ConfigManager']

last_time_check = time.time()
def update_result(result, type='daily', meta = {}):
    command_id = meta["command_id"]

    output = {
        "id": command_id,
        "status": 2,
        "json_result": json.dumps(result)
    }

    url = "https://staging.pontusinc.com/api/news_management/v1/quick_search/update-command-result"
    # with open("/home/vietle/topic-clustering/config/save_.json", 'w') as f:
    #     json.dump(output, f,ensure_ascii=False)
    res = requests.post(url, json = output)
    print(res.text)
    print('Update result !!!!!!!!!')

def callback_func(ch, method, properties, body):
    print("receive done: ")
    starttime = time.time()
    body = json.loads(body.decode("utf-8"))

    # with open('input_daily.json','w') as f:
    #     json.dump(body, f, ensure_ascii=False)

    docs = body['docs']
    # threshold = body['threshold']
    command_id = body.get("command_id",0)
    print(command_id)
    threshold = 0.1
    top_cluster = body['top_cluster']
    top_sentence = body['top_sentence']
    topn_summary = body['topn_summary']
    hash_str = body['hash_str']
    st_time = body['st_time']
    meta = body.get('meta',{})
    meta["command_id"] = command_id

    is_cache = False
    try:
        with open("log_mnews/log/log.txt") as f:
            data_dict = json.load(f)
    except Exception as ve:
        print(ve)
        data_dict = {}

    try:
        if hash_str in data_dict:
            path_res = data_dict[hash_str]["response_path"]
            with open(path_res) as ff:
                results = json.load(ff)
            print("time analysis (cache): ", time.time() - st_time)
            update_result(results,meta=meta)
            is_cache = True
    except Exception as vee:
        print(vee)

    if not is_cache:
        results = tc.topic_clustering(docs, threshold, top_cluster=top_cluster, top_sentence=top_sentence,
                                      topn_summary=topn_summary)
        update_result(results, meta=meta)

        path_res = "log_mnews/result_{0}.txt".format(hash_str)
        with open(path_res, "w+") as ff:
            ff.write(json.dumps(results))

        data_dict[hash_str] = {"time": st_time, "response_path": path_res}

    lst_rm = []
    global last_time_check
    if time.time() - last_time_check > 3600:
        print("check log to del .....")
        last_time_check = time.time()
        for dt in data_dict:
            if time.time() - data_dict[dt]["time"] > 30 * 24 * 3600:
                lst_rm.append(dt)
    for dt in lst_rm:
        del data_dict[dt]
    with open("log_mnews/log/log.txt", "w+") as ff:
        ff.write(json.dumps(data_dict))
    print("time analysis: ", time.time() - starttime)
    ch.basic_ack(delivery_tag=method.delivery_tag)


def test():
    with open('req_daily/aus.json') as f:
        body = json.load(f)

    docs = body['response']['docs']
    # threshold = body['threshold']
    threshold = 0.25
    top_cluster = body['top_cluster']
    top_sentence = body['top_sentence']
    topn_summary = body['topn_summary']
    # hash_str = body['hash_str']
    # st_time = body['st_time']
    meta = body['response'].get('meta',{})
    results = tc.topic_clustering(docs, threshold, top_cluster=top_cluster, top_sentence=top_sentence,
                                  topn_summary=topn_summary)
    update_result(results, meta=meta)
    print(123)
if __name__ == '__main__':
    params = ConfigManager['QueueConfigs']['queue_topic_clustering_mnews']
    usr_name = params["UserName"]
    password = str(params["Password"])
    host = params["HostName"]
    virtual_host = params["VirtualHost"]
    queue_name = params["Queue"]

    while True:
        try:
            credentials = pika.PlainCredentials(usr_name, password)
            connection = pika.BlockingConnection(
                pika.ConnectionParameters(host=host, virtual_host=virtual_host, credentials=credentials, heartbeat=3600, blocked_connection_timeout=3600))
            channel = connection.channel()
            channel.queue_declare(queue=queue_name, durable=True, arguments={"x-max-priority": 10})
            print(" * wait message")
            channel.basic_qos(prefetch_count=1)
            channel.basic_consume(queue=queue_name, on_message_callback=callback_func)
            channel.start_consuming()
        except Exception as ex:
            print(f'[ERROR] ', ex)
            # raise ex
consumer_hot_topic_ondemand.py
ADDED
@@ -0,0 +1,209 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from entity import InputHotTopic, ResponseQueue
from threading import Thread
from queue import Queue
import time
import json, requests
from service_cache_Thien import get_data_sorl
import os
from datetime import datetime
from email_validator import push_msg_tele

import time
import json
import hashlib


from pydantic import BaseModel

class InputHotTopic(BaseModel):
    start_time: str = "2024-09-03 23:00:00"
    end_time: str = "2024-09-05 23:00:00"
    query: str = "Giá nhà chung cư trên Hà Nội"
    keywords: list = ["chung cư, Hà Nội", "Hoà Lạc"]
    top_cluster: int = 5
    prompt: str = """Trong 300 từ, hãy tổng hợp thành một đoạn văn một cách đầy đủ, chi tiết, và trung thực về các chủ đề xung quanh biến động giá nhà chung cư Hà Nội từ nội dung dưới đây.
    Nếu không có thông tin gì liên quan đến giá nhà chung cư Hà Nội trong nội dung cung cấp thì trả lời "không có thông tin". Không đưa quan điểm cá nhân, không lặp lại một phần câu hỏi, loại bỏ phần mở đầu. Không có những câu từ liên kết như: "Sau đây là nội dung tóm tắt", "Nội dung tóm tắt là", "Dưới đây là " ... """
    check_relevent: str = "Hãy đánh giá nội dung dưới đây có thông tin liên quan đến giá cả nhà chung cư Hà Nội hay không? Chỉ trả lời có hoặc không, không đưa thêm thông tin không liên quan"
    max_posts: int = 5000

def get_hash_id(item: InputHotTopic):
    str_hash = ""
    if item.id_topic:
        str_hash += item.id_topic
        str_hash += item.start_time
        return hashlib.sha224(str_hash.encode("utf-8")).hexdigest()
    else:
        return ""

class SessionProcess(object):

    def __init__(self):
        self.session = dict()

    def hash_session(self, query: InputHotTopic):
        hash_dict = query.dict()
        hash_dict['time'] = int(time.time())
        return hashlib.sha224(json.dumps(hash_dict).encode("utf-8")).hexdigest()

    def insert_session(self, data_input):
        print('data_input: ', data_input)
        # if self.mode == "command_center":
        #     hash_id = hash_session(data_input)
        # else:
        hash_id = self.hash_session(data_input)
        if hash_id not in self.session:
            self.session[hash_id] = {"status": 0, "created_time": time.time(), "update_time": time.time(),
                                     "result": {}, "data": data_input}
        return hash_id

    def get_info_session(self, hash_id: str):
        if hash_id in self.session:
            return self.session[hash_id]
        return {"status": -2, "result": {}, "meta": {}}

    def update_session(self, hash_id: str, result: dict, status: int):
        if hash_id in self.session:
            self.session[hash_id]["status"] = status
            self.session[hash_id]["result"] = result
            self.session[hash_id]["update_time"] = time.time()
            return True
        return False

    def delete_session(self, hash_id: str):
        if hash_id in self.session:
            del self.session[hash_id]
            return True
        return False

SESSION = SessionProcess()
app = FastAPI(title="Hot Topic")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

NUM_OF_THREAD = 2
QQ = Queue(maxsize=0) # don't limit queue

def process_wc():
    print('Run thr')
    global SESSION, QQ
    while True:
        if not QQ.empty():
            hash_id = QQ.get()
            SESSION.update_session(hash_id, {}, 0)
            print("update trạng thái status = 0: đang xử lý")
            try:
                ss_info = SESSION.get_info_session(hash_id)
                status = ss_info["status"]
                print("trạng thái hiện tại: ", status)
                if status == 0:
                    data_input = SESSION.session[hash_id]["data"]
                    res_doc = get_data_sorl(data_input.query, data_input.keywords, data_input.start_time, data_input.end_time, max_posts = data_input.max_posts)
                    print('lenght res_doc: ', len(res_doc))
                    if not res_doc:
                        SESSION.update_session(hash_id, {}, -1)
                    else:
                        # start_time: str = "2024-03-03 23:00:00"
                        current_time = datetime.now()
                        time_now = current_time.strftime("%Y-%m-%d %H:%M:%S")
                        d = {
                            "id_topic": "99999",
                            "start_time": time_now,
                            "end_time": data_input.end_time,
                            "threshold": 0.3,
                            "top_sentence": -1,
                            "top_cluster": data_input.top_cluster,
                            "topn_summary": 10,
                            "type_cluster": "",
                            "lang_process": "",
                            "prompt": data_input.prompt,
                            "topic_name": data_input.check_relevent,
                            "responseHeader": {},
                            "benchmark_topics": [],
                            "response": {"docs": res_doc}
                        }

                        str_hash = ""
                        str_hash += "99999"
                        str_hash += time_now
                        hash_id_path = hashlib.sha224(str_hash.encode("utf-8")).hexdigest()

                        st_time = time.time()
                        try:
                            response = requests.post('http://10.9.3.241:8636/newsanalysis/topic_clustering', json=d, timeout=5)
                        except:
                            print("Timeout done")

                        print("push done msg")
                        res_clus = {}
                        # flag = False
                        # count = 0
                        # while not flag and count < 18000:
                        #     if os.path.exists("/home/vietle/topic-clustering/log/result_{0}.txt".format(hash_id_path)):
                        #         path_res = "/home/vietle/topic-clustering/log/result_{0}.txt".format(hash_id_path)
                        #         with open(path_res, encoding="utf-8") as ff:
                        #             res_clus = json.load(ff)
                        #         res_clus["num_articles"] = len(res_doc)
                        #         message = "Hello"
                        #         push_msg_tele(data_input.bot_token , data_input.chat_id , message)
                        #         print('done processing result')
                        #         flag = True
                        #     time.sleep(1)
                        #     count +=1
                        #     print('sleep: ', count)


                        print("update done msg")
                        SESSION.update_session(hash_id_path, res_clus, 1)
            except Exception as ve_:
                print(ve_)
                SESSION.update_session(hash_id_path, {}, -1)
                raise ve_
        else:
            time.sleep(2)


for _ in range(NUM_OF_THREAD):
    worker = Thread(target=process_wc, args=())
    worker.setDaemon(True)
    worker.start()


@app.post("/api/v1/send_message")
def send_requests(item: InputHotTopic):
    global SESSION
    hash_id = SESSION.insert_session(item)
    if SESSION.session[hash_id]["status"] == 0:
        QQ.put(hash_id)

    return ResponseQueue(statusCode=1, message="Push to queue done !", result={"hash_id": hash_id})

class InputSession(BaseModel):
    hash_id: str = ""

class Response(BaseModel):
    statusCode: int = 200
    message: str = ""
    result: dict = {}
@app.post("/api/mining/qna/result")
def get_result(item: InputSession):
    global SESSION
    res = SESSION.get_info_session(item.hash_id)
    status = res["status"]
    res = res["result"]
    if status == -1:
        msg = "ERROR"
    elif status == 0:
        msg = "processing ..."
    elif status == 1:
        msg = "done"
        # SESSION.delete_session(item.hash_id)
    else:
        msg = "nothing"
    return Response(statusCode=status, message=msg, result=res)
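For reference (not part of the commit), a hypothetical client for the two routes defined above: `/api/v1/send_message` to enqueue a request and `/api/mining/qna/result` to poll for it. The base URL is an assumption (the app would be served separately, e.g. with uvicorn), and the response shape assumes `ResponseQueue` serializes to the `statusCode`/`message`/`result` fields used in the handlers.

```python
# Hypothetical client: submit a hot-topic request, then poll until done (1) or error (-1).
import time
import requests

BASE = "http://localhost:8000"  # assumption: wherever the FastAPI app is served

payload = {
    "start_time": "2024-09-03 23:00:00",
    "end_time": "2024-09-05 23:00:00",
    "query": "Giá nhà chung cư trên Hà Nội",
    "keywords": ["chung cư, Hà Nội", "Hoà Lạc"],
    "top_cluster": 5,
    "max_posts": 5000,
}
resp = requests.post(f"{BASE}/api/v1/send_message", json=payload).json()
hash_id = resp["result"]["hash_id"]

while True:
    res = requests.post(f"{BASE}/api/mining/qna/result", json={"hash_id": hash_id}).json()
    if res["statusCode"] in (1, -1):
        print(res["message"], res["result"])
        break
    time.sleep(5)
```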
consumer_merge_clustering.py
ADDED
@@ -0,0 +1,96 @@
# import os
# os.environ["CUDA_VISIBLE_DEVICES"] ="-1"
import pika
import json
import time
import requests

from merge_topic import main
# from get_config import config_params
from config import get_config

config_params = get_config()
ConfigManager = config_params['ConfigManager']

def update_result(result, type='daily', meta = {}):
    benchmark_children_id = -1
    benchmark_id = -1
    source_tagids = []
    for id_cluster in result:
        for doc in result[id_cluster][:1]:
            source_tagids = doc.get('source_tagids',[])
            for key in doc:
                if "benchmark_child" in key:
                    benchmark_children_id = int(key.lstrip('benchmark_child_'))
                if "benchmark" in key and 'child' not in key:
                    benchmark_id = int(key.lstrip('benchmark_'))
        break

    if not source_tagids:
        source_tagids = []
    if len(source_tagids) > 0:
        benchmark_id = 0
        benchmark_children_id = 0

    output = {
        "benchmark_id": benchmark_id,
        "benchmark_children_id": benchmark_children_id,
        "source_tagids": source_tagids,
        "country_code": meta.get('country_code',''),
        "type": type,
        "data": json.dumps(result)
    }

    # with open('test_result.json','w') as f:
    #     json.dump(output, f, ensure_ascii=False)

    # url = config_params['api_save_clustering']
    url = ConfigManager['ApiConnects']['api_save_clustering']['BaseUrl']

    res = requests.post(url, json = output)
    print(res.text)
    print('Update result !!!!!!!!!')

def callback_func(ch, method, properties, body):
    print("receive done: ")
    starttime = time.time()
    body = json.loads(body.decode("utf-8"))

    req = body
    type = req['type']
    meta = req.get('meta', {})
    res = main(req)
    update_result(res, type, meta=meta)
    print('Time process:', time.time() - starttime)
    ch.basic_ack(delivery_tag=method.delivery_tag)


if __name__ == '__main__':
    params = ConfigManager['QueueConfigs']['queue_merge_clustering']
    usr_name = params["UserName"]
    password = str(params["Password"])
    host = params["HostName"]
    virtual_host = params["VirtualHost"]
    queue_name = params["Queue"]

    # params = config_params['queue_merge_clustering']
    # usr_name = params["usr_name"]
    # password = str(params["password"])
    # host = params["host"]
    # virtual_host = params["virtual_host"]
    # queue_name = params["queue_name"]

    while True:
        try:
            credentials = pika.PlainCredentials(usr_name, password)
            connection = pika.BlockingConnection(
                pika.ConnectionParameters(host=host, virtual_host=virtual_host, credentials=credentials, heartbeat=3600, blocked_connection_timeout=3600))
            channel = connection.channel()
            channel.queue_declare(queue=queue_name, durable=True, arguments={"x-max-priority": 10})
            print(" * wait message")
            channel.basic_qos(prefetch_count=1)
            channel.basic_consume(queue=queue_name, on_message_callback=callback_func)
            channel.start_consuming()
        except Exception as ex:
            print(f'[ERROR] ', ex)
            # raise ex
consumer_merge_clustering_newscms.py
ADDED
@@ -0,0 +1,109 @@
# import os
# os.environ['CUDA_VISIBLE_DEVICES'] = "-1"

import pika
import json
import time
import requests

from merge_topic import main
# from get_config import config_params
from config import get_config

config_params = get_config()
ConfigManager = config_params['ConfigManager']
URL_SAVE_CLUSTERING_CMS = ConfigManager["ApiConnects"]["api_save_cluster_newscms"]["BaseUrl"]

def update_result(result, id, meta = {}):
    print(result)
    print("-----")
    output = {
        "id": id,
        "result":json.dumps(result)
    }
    res = requests.post(url=URL_SAVE_CLUSTERING_CMS, json = output)
    print(res.text)
    print('Update result !!!!!!!!!')

def callback_func(ch, method, properties, body):
    print("receive done: ")
    starttime = time.time()
    body = json.loads(body.decode("utf-8"))
    with open("/home/vietle/topic-clustering/input_merge1.json", 'w') as f:
        json.dump(body,f,ensure_ascii = False)
    req = body
    req["type"] = "monthly"
    id = req["id"]
    meta = req.get('meta', {})

    preprocess_reformat = []
    preprocess = req["preprocess"]
    for daily_clusters in preprocess:

        dict_cluster = {}
        for i,doc in enumerate(daily_clusters["topic"]):
            reps_post = doc
            lst_doc = [reps_post]
            lst_doc.extend(doc.get("list_posts", []))
            dict_cluster[i] = lst_doc
        it = {
            "topic": dict_cluster
        }
        preprocess_reformat.append(it)
    req["preprocess"] = preprocess_reformat

    res = main(req)
    update_result(res, id, meta=meta)
    print('Time process:', time.time() - starttime)
    ch.basic_ack(delivery_tag=method.delivery_tag)


def test():
    with open("/home/vietle/topic-clustering/input_merge1.json", 'r') as f:
        body = json.load(f)

    req = body
    req["type"] = "monthly"
    id = req["id"]
    meta = req.get('meta', {})

    preprocess_reformat = []
    preprocess = req["preprocess"]
    for daily_clusters in preprocess:

        dict_cluster = {}
        for i,topic in enumerate(daily_clusters["topic"]):
            dict_cluster[i] = topic
        it = {
            "topic": dict_cluster
        }
        preprocess_reformat.append(it)
    req["preprocess"] = preprocess_reformat
    with open("/home/vietle/topic-clustering/input_merge2.json", 'w') as f:
        json.dump(req,f,ensure_ascii = False)
    res = main(req)

if __name__ == '__main__':
    # test()
    params = ConfigManager['QueueConfigs']['queue_merge_clustering_newscms']
    usr_name = params["UserName"]
    password = str(params["Password"])
    host = params["HostName"]
    virtual_host = params["VirtualHost"]
    queue_name = params["Queue"]


    while True:
        try:
            credentials = pika.PlainCredentials(usr_name, password)
            connection = pika.BlockingConnection(
                pika.ConnectionParameters(host=host, virtual_host=virtual_host, credentials=credentials, heartbeat=3600, blocked_connection_timeout=3600))
            channel = connection.channel()
            channel.queue_declare(queue=queue_name, durable=True, arguments={"x-max-priority": 10})
            print(" * wait message")
            channel.basic_qos(prefetch_count=1)
            channel.basic_consume(queue=queue_name, on_message_callback=callback_func)
            channel.start_consuming()
        except Exception as ex:
            print(f'[ERROR] ', ex)
            raise ex
deployment.yaml
ADDED
@@ -0,0 +1,28 @@
apiVersion: apps/v1beta1
kind: Deployment
metadata:
  name: clusteringcpu
  labels:
    app: clusteringcpu
spec:
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: clusteringcpu
        tier: clusteringcpu
    spec:
      containers:
      - image: {{registry_host}}/clusteringcpu:{{tag}}
        name: clusteringcpu
        resources:
          requests:
            cpu: 6
            memory: 60000Mi
          limits:
            cpu: 10
            memory: 100000Mi
        ports:
        - containerPort:
          name:
docker/api_trt/Dockerfile
ADDED
@@ -0,0 +1,22 @@
FROM nvcr.io/nvidia/tensorrt:22.07-py3

EXPOSE 8633
WORKDIR /app

RUN echo "nameserver 8.8.8.8" > /etc/resolv.conf && pip install --upgrade pip

RUN echo "nameserver 8.8.8.8" > /etc/resolv.conf && apt-get update && \
    DEBIAN_FRONTEND=noninteractive \
    apt-get -y install default-jre-headless && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

COPY req.txt /app/
RUN echo "nameserver 8.8.8.8" > /etc/resolv.conf && pip install -r req.txt
RUN echo "nameserver 8.8.8.8" > /etc/resolv.conf && pip install --upgrade pip && pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113

COPY ./main_cache.py /app/


RUN rm -r ~/.cache/pip/*
CMD ["python3", "main_cache.py"]
docker/cpu/Dockerfile
ADDED
@@ -0,0 +1,46 @@
FROM python:3.7

WORKDIR /app


RUN apt-get update && apt-get install build-essential cmake git -y

#----------------JRE (for vncorenlp)--------------
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive \
    apt-get -y install default-jre-headless && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

RUN apt-get install unzip
RUN pip install gdown
RUN gdown --id 1MTAPYy9AcYtfiJ6m_pz6MPeA6li8pYp7
RUN unzip vncorenlp.zip -d /app/


# COPY ./model /app/model

RUN mkdir -p /app/model
RUN mkdir -p /app/log
RUN mkdir -p /app/log_run

COPY reqs_cpu.txt /app/
RUN pip install -r reqs_cpu.txt

COPY ./load_model.py /app/
RUN python load_model.py

COPY ./config /app/config
COPY ./entity /app/entity
COPY ./function /app/function
COPY ./main_cache.py /app/
COPY ./service_cache.py /app/
COPY ./summary.py /app/
COPY ./merge_topic.py /app/
COPY ./consumer_clustering.py /app/
COPY ./consumer_merge_clustering.py /app/
COPY ./run_multi_process.py /app/

RUN rm -r ~/.cache/pip/*

CMD ["python", "run_multi_process.py"]
docker/gpu/Dockerfile_gpu
ADDED
@@ -0,0 +1,33 @@
FROM python:3.7

WORKDIR /app


RUN apt-get update && apt-get install build-essential cmake git -y

#----------------JRE (for vncorenlp)--------------
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive \
    apt-get -y install default-jre-headless && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

COPY ./model /app/model

RUN pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html
RUN mkdir log
RUN mkdir log_run

COPY req.txt /app/
RUN pip install -r req.txt

COPY ./entity /app/entity
COPY ./function /app/function
COPY ./vncorenlp /app/vncorenlp
COPY ./main_cache.py /app/
COPY ./service_cache.py /app/
COPY ./summary.py /app/

RUN rm -r ~/.cache/pip/*

CMD ["python", "main_cache.py"]
docker_build.txt
ADDED
@@ -0,0 +1,6 @@
docker build -t topic_clustering .
docker run -d --restart=always -p8633:8633 --name topic_clustering topic_clustering
/////docker run -p8633:8633 --name topic_clustering topic_clustering

docker build -t clustering-dashboard .
docker run --name clustering-dashboard clustering-dashboard
entity/__init__.py
ADDED
@@ -0,0 +1 @@
from .types import *
entity/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (181 Bytes).
entity/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (205 Bytes).
entity/__pycache__/types.cpython-37.pyc
ADDED
Binary file (129 kB).
entity/__pycache__/types.cpython-38.pyc
ADDED
Binary file (129 kB).
entity/types.py
ADDED
The diff for this file is too large to render.
function/SessionProcess.py
ADDED
@@ -0,0 +1,55 @@
import time
import json
import hashlib

from pydantic import BaseModel


class InputHotTopic(BaseModel):
    start_time: str = ""
    end_time: str = ""
    query: str = ""
    keywords: list = []
    top_cluster: int = 5
    prompt: str = ""
    check_relevent: str = ""


class SessionProcess(object):

    def __init__(self):
        self.session = dict()

    def hash_session(self, query: InputHotTopic):
        hash_dict = query.dict()
        hash_dict['time'] = int(time.time())
        return hashlib.sha224(json.dumps(hash_dict).encode("utf-8")).hexdigest()

    def insert_session(self, data_input):
        print('data_input: ', data_input)
        # if self.mode == "command_center":
        #     hash_id = hash_session(data_input)
        # else:
        hash_id = self.hash_session(data_input)
        if hash_id not in self.session:
            self.session[hash_id] = {"status": 0, "created_time": time.time(), "update_time": time.time(),
                                     "result": {}, "data": data_input}
        return hash_id

    def get_info_session(self, hash_id: str):
        if hash_id in self.session:
            return self.session[hash_id]
        return {"status": -2, "result": {}, "meta": {}}

    def update_session(self, hash_id: str, result: dict, status: int):
        if hash_id in self.session:
            self.session[hash_id]["status"] = status
            self.session[hash_id]["result"] = result
            self.session[hash_id]["update_time"] = time.time()
            return True
        return False

    def delete_session(self, hash_id: str):
        if hash_id in self.session:
            del self.session[hash_id]
            return True
        return False

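A minimal usage sketch for the session store above (illustrative only; the status value 1 is an assumed convention, not something the class enforces):

    from function.SessionProcess import SessionProcess, InputHotTopic

    store = SessionProcess()
    request = InputHotTopic(query="biển đông", top_cluster=5)

    # Register the request; the returned hash is the key shared by the worker and the API layer.
    hash_id = store.insert_session(request)

    # Later, a worker attaches its result and flips the status flag.
    store.update_session(hash_id, result={"clusters": []}, status=1)
    print(store.get_info_session(hash_id)["status"])

    # Entries live only in process memory, so they are deleted once consumed.
    store.delete_session(hash_id)
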
function/__init__.py
ADDED
@@ -0,0 +1,3 @@
# from . import topic_clustering_v2
# from . import sentence_embedding
from .SessionProcess import SessionProcess  # relative import so the name resolves when `function` is imported as a package

function/__pycache__/SessionProcess.cpython-37.pyc
ADDED
Binary file (1.84 kB).
function/__pycache__/SessionProcess.cpython-38.pyc
ADDED
Binary file (1.88 kB).
function/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (161 Bytes).
function/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (220 Bytes).
function/__pycache__/tc_v2.cpython-37.pyc
ADDED
Binary file (15.9 kB).
function/__pycache__/tc_v2.cpython-38.pyc
ADDED
Binary file (15.4 kB).
function/clean_text.py
ADDED
@@ -0,0 +1,84 @@
import re


def normalize_text(text):
    # text = text.lower()
    text = text.replace('🏻', '')
    full_text_clean = text
    full_text_clean = full_text_clean.replace(");this.closest('table').remove();", "")  # assignment added: str.replace returns a new string
    full_text_clean = re.sub('(Thứ .{2,4}|Chủ nhật),( ngày)? \d{1,2}\/\d{1,2}\/\d{4}( \d{1,2}:\d{1,2})?( AM| PM)?( \(GMT.{1,3}\))?','',full_text_clean)
    if not any([word in full_text_clean[:15].lower() for word in ['nga','covid']]):
        full_text_clean = re.sub('^.{1,15}?-','',full_text_clean)
        # full_text_clean = re.sub('-.{1,15}?$','',full_text_clean)

    full_text_clean = re.sub('- VNExplorer','',full_text_clean)
    full_text_clean = re.sub('Theo .{1,20}$','',full_text_clean)
    full_text_clean = re.sub('\(.*(Ảnh|Nguồn).*?\)','',full_text_clean)
    full_text_clean = re.sub('\d{1,2} (giờ|phút) trước','',full_text_clean)
    full_text_clean = re.sub(r"http\S+", "", full_text_clean)
    # full_text_clean = re.sub('(\\r)*( )*(\\n)*( )*(\\r)*( )*(\\n)','.', full_text_clean)
    full_text_clean = re.sub(r"\.( )*(\.)+", '. ', full_text_clean)
    full_text_clean = re.sub('\.(?!\d)', '. ', full_text_clean)
    full_text_clean = re.sub('(\.(\s)+)+', '. ', full_text_clean)
    full_text_clean = re.sub('<[^<]+?>', '',full_text_clean)
    full_text_clean = re.sub('\d{1,2}:\d{2}( )?\d{1,2}\/\d{1,2}\/\d{4}','',full_text_clean)
    full_text_clean = re.sub("Ảnh(:)?(Getty)?","", full_text_clean)
    full_text_clean = full_text_clean.replace("Read more about:","").replace("Read more","").replace("Advertising","").replace("bookmark border.","").replace('the latest tech news, global tech news daily, tech news today, startups, usa tech, asia tech, china tech, eu tech, global tech, in-depth electronics reviews, 24h tech news, 24h tech news, top mobile apps, tech news daily, gaming hardware, big tech news, useful technology tips, expert interviews, reporting on the business of technology, venture capital funding, programing language',"").replace('Live updates:','').replace('-VNExplorer','').replace('Reuters:','').replace('AFP:','').replace('�','').replace('- VNExplorer','').replace('Global Tech News Daily','').replace('AFP/TTXVN','').replace('Reuters/TTXVN','').replace('Tin quốc tế','').replace('Xem tiện ích cảnh báo giá CK','')
    full_text_clean = full_text_clean.replace("COVID","Covid")
    full_text_clean = re.sub('[A-Z ]{2,10}(,.{6,20})?(—|–|-|-)','',full_text_clean) #NEW YORK, Feb 27 — .... /BRUSSELS—...
    full_text_clean = re.sub('\(ảnh:.*?\)','.',full_text_clean)
    full_text_clean = re.sub("(\| )?(\(.{1,7}\)( )?)+$", "", full_text_clean)
    full_text_clean = re.sub('\d{2} [\w]{3,4}, \d{4}. \d{2}.\d{2} (AM|PM) IST','',full_text_clean) #02 Mar, 2022, 10.01 AM IST
    full_text_clean = full_text_clean.replace('Suzuka. config. supports_premium_subscription && window. localStorage. getItem ( "premiumSubscription ")) ) {var e = document. createElement ( "script "); e. setAttribute ( "class ", "titan-conditional "); e. setAttribute ( "data-ad-id ", "adspot-300x250-pos1 "); document. body. appendChild (e);}','')

    full_text_clean = re.sub('\d{2}\/\d{2}\/\d{4} \d{2}:\d{2} GMT(\+|-)\d{1,2}', "", full_text_clean)
    full_text_clean = re.sub('\(.{1,20}\)', '', full_text_clean)
    full_text_clean = re.sub('\{.{1,20}\}', '', full_text_clean)
    full_text_clean = re.sub('\[.{1,20}\]', '', full_text_clean)
    full_text_clean = re.sub('[A-Z].{5,10} , \d{2}:\d{2} (GMT(\+|-)\d{1,2})?',"",full_text_clean)

    full_text_clean = re.sub('(Theo|theo) .{1,15},', '', full_text_clean)
    full_text_clean = re.sub('(Theo|theo) .{1,15},', '', full_text_clean)
    full_text_clean = re.sub('theo.{3,20}$','', full_text_clean)
    full_text_clean = re.sub('^Trong khi đó','', full_text_clean)

    full_text_clean = re.sub('^\d{1,10} minutes ago', '', full_text_clean)
    full_text_clean = re.sub('^\d{1,10} hours ago', '', full_text_clean)
    full_text_clean = re.sub('^\d{1,10} days ago', '', full_text_clean)
    full_text_clean = re.sub('^\d{1,10} years ago', '', full_text_clean)
    full_text_clean = re.sub('^\d{1,10} months ago', '', full_text_clean)
    full_text_clean = re.sub('^\d{1,10} minute ago', '', full_text_clean)
    full_text_clean = re.sub('^\d{1,10} day ago', '', full_text_clean)
    full_text_clean = re.sub('^\d{1,10} year ago', '', full_text_clean)
    full_text_clean = re.sub('^\d{1,10} month ago', '', full_text_clean)
    full_text_clean = re.sub('^\d{1,10} hour ago', '', full_text_clean)
    full_text_clean = re.sub('^(a|an) minute ago', '', full_text_clean)
    full_text_clean = re.sub('^(a|an) hour ago', '', full_text_clean)
    full_text_clean = re.sub('^(a|an) day ago', '', full_text_clean)
    full_text_clean = re.sub('^(a|an) month ago', '', full_text_clean)
    full_text_clean = re.sub('^(a|an) year ago', '', full_text_clean)
    full_text_clean = re.sub('^.{0,12}(tờ|theo|nguồn|trích dẫn|trang|báo|hãng).{1,30}(đưa tin|dẫn lời|trích dẫn|nhận định|cho biết)', '', full_text_clean, flags=re.I)

    text = re.sub('\s+',' ',full_text_clean)
    text = re.sub('Đọc chi tiết bài viết tại đây.*','',text,flags=re.I)
    # text = re.sub('[(\d)(\:)(\|)(\/)(\s+)]+','',text) # 10:20 | 09/03/2022

    text = re.sub('(\d{1,2}:\d{2}( )*)\|( )*\d{1,2}(/|-)\d{2}(/|-)\d{4}','',text)
    text = re.sub('^(\d)+[\,\.]\s+ ','',text) # 3, phát ngôn viên Bộ T
    text = re.sub('((chủ nhật)|(thứ bảy)|(thử sáu)|(thứ năm)|(thứ tư)|(thứ ba)|(thứ hai))([(\d)(\:)(,)(\|\/)(\s+)]+)((VOV)|(VTV))$','',text,flags=re.I) # và Ukraine để giải quyết xung đột Chủ Nhật, 06:32, 20/03/2022 VOV.

    text = re.sub('^((\d)|(\:)|(\.)|(\|)|(\s+)|(in bài biết)|(in bài viết)|(\/))+ ','',text,flags=re.I) # 10:20 | 09/03/2022 In bài biết. 10:20 | 09/03/2022 In bài biết Việc xuất khẩu tôm sang thị trường Nga có thể bị ảnh hưởng trước tình hình căng thẳng của Nga-Ukraine. Hiệp hội Chế biến và Xuất khẩu thuỷ sản V
    text = re.sub('theo hãng tin [A-Z].{0,15},','', text, flags=re.I)
    text = re.sub('((Theo.{0,30})|(Reuters)).*?(link gốc).*?$','',text,flags=re.I)
    text = re.sub('video:RIA Novosti/Bộ Quốc phòng Nga','',text,flags=re.I)
    text = re.sub('Báo.{5,20}$','',text)
    text = re.sub('RIA Novosti/Bộ Quốc phòng Nga','',text)
    text = re.sub('(chính trị|quân sự|đối ngoại|thời sự|khoa học|pháp luật) \d{1,2} giờ','',text,flags=re.I)
    text = text.replace('|','')
    full_text_clean = re.sub('^.*?(Link nguồn)','',text,flags=re.I) # (
    full_text_clean = re.sub(',( )*[A-z].{1,30}(đưa tin|trích dẫn)','', full_text_clean)
    full_text_clean = re.sub('(Reuters|Vnexpress)(\).)?','',full_text_clean,flags=re.I)
    full_text_clean = re.sub('^VOV.','',full_text_clean)
    full_text_clean = full_text_clean.replace("Many Japanese worry Fortune-Takashi Nakamichi, Bloomberg • 1d","").replace('baotintuc. vn',"").replace('YÊN BÁI QUYẾT TÂM THỰC HIỆN THẮNG LỢI CHƯƠNG TRÌNH HÀNH ĐỘNG SỐ 56 – CTr/TU CỦA TỈNH ỦY QUYẾT TÂM ĐƯA Y.ÊN BÁI PHÁT TRIỂN NHANH, BỀN VỮNG THEO HƯỚNG “XANH, HÀI HÒA, BẢN SẮC VÀ HẠNH PHÚC” TRỞ THÀNH TỈNH PHÁT TRIỂN KHÁ VÀO NĂM 2025','')
    full_text_clean = full_text_clean.replace("Baoquocte","").replace('ชั่วโมงที่ผ่านมา.','').replace('""challenge=" "coron= ""corona=" "covid-19= ""designs=" "endgame= ""tutorial=" "ui= ""pandemic=" "quarantine= ""list=" "similarity= " "ux. ""press=" "copyright= ""contact=" "creators= ""advertise=" "terms= ""privacy=" "policy= ""safety=" "youtube= ""works=" "test= ""features=" "creators.', '').replace('nbsp & nbsp & nbsp & nbsp & nbsp Copy Link', '').replace('Tổng thống Nga Vladimir Putin và Chủ tịch Trung Quốc Tập Cận Bình.','').replace('Thế giới Toàn cảnh Bảo Hà ','')
    full_text_clean = re.sub('(a|p)\. m\.','',full_text_clean)
    return full_text_clean

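An illustrative call of normalize_text above on an invented headline-style string; the function is a chain of regex substitutions, so the exact output depends on which patterns fire:

    from function.clean_text import normalize_text

    raw = "Thứ hai, ngày 21/3/2022 10:20 | 09/03/2022 In bài biết Giá dầu tăng mạnh. Theo Reuters."
    print(normalize_text(raw))
    # Date stamps, the "In bài biết" boilerplate and trailing source attributions are the
    # kinds of fragments the substitutions above are meant to strip.
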
function/detect_time.py
ADDED
@@ -0,0 +1,92 @@
import re
import requests
import datetime
import operator
from typing import *
from dateutil.relativedelta import *
from itertools import groupby
from dateparser import parse

day = '[0-3]{0,1}[0-9]'
month = '[0,1]{0,1}[0-9]'
year = '\d{4}'
sep = '\s*[-/\.\s]\s*'

patterns = [
    f"{day}{sep}{month}{sep}{year}",
    f"{year}{sep}{month}{sep}{day}",
    f"{day}\s+tháng\s+{month}",
    f"{day}\s+tháng\s+{month}{sep}{year}",
    f"{day}\s+tháng\s+{month}\s+năm\s+{year}",
    f"{day}\s+tháng\s+{month}",
    f"(?<=ngày)\s+{day}{sep}{month}",
    f"(?<=ngày)\s+{day}{sep}{month}{sep}{year}",
    f"(?<=sáng)\s+{day}{sep}{month}",
    f"(?<=sáng)\s+{day}{sep}{month}{sep}{year}",
    f"(?<=trưa)\s+{day}{sep}{month}",
    f"(?<=trưa)\s+{day}{sep}{month}{sep}{year}",
    f"(?<=chiều)\s+{day}{sep}{month}",
    f"(?<=chiều)\s+{day}{sep}{month}{sep}{year}",
    f"(?<=tối)\s+{day}{sep}{month}",
    f"(?<=tối)\s+{day}{sep}{month}{sep}{year}",  # comma added: without it this literal silently concatenated with the next pattern
    f"(?<=đêm)\s+{day}{sep}{month}",
    f"(?<=đêm)\s+{day}{sep}{month}{sep}{year}",
    f"(?<=hôm)\s+{day}{sep}{month}",
    f"(?<=hôm)\s+{day}{sep}{month}{sep}{year}",
    f"{day}{sep}{month}[\s\.\,\)]"
]


def extract_pattern(text: str, patterns: List[str]):
    detected = []
    for pattern in patterns:
        for match in re.finditer(pattern, text):
            detected.append((match.start(), match.end()))
    detected.sort()
    output = []
    curr = -1
    for start, values in groupby(detected, key=operator.itemgetter(0)):
        if start < curr:
            continue
        values = list(values)
        values.sort(key=operator.itemgetter(1), reverse=True)
        output.append(values[0])
        curr = values[0][1]
    return output


def detect_time(text: str, language: str = 'vi', base: Optional[datetime.datetime] = None):
    text = text.lower()
    detected_patterns = extract_pattern(text, patterns)
    output = []
    settings = {
        'PREFER_DAY_OF_MONTH': 'first'
    }
    if base:
        settings['RELATIVE_BASE'] = base
    for start, end in detected_patterns:
        segment = text[start:end]
        segment = re.sub('\s+', ' ', segment).strip().lower()
        candiate = parse(segment, languages=[language], settings=settings)
        output.append((segment, candiate))
    return output


def get_time_post(sentences, patterns, start_time=None, end_time=None):
    dict_time_evs = {}
    for i, sen in enumerate(sentences):
        if sen.strip() != "":
            time_ex = detect_time(sen, patterns)
            for te in time_ex:
                if te[1] is not None:
                    if start_time is None or end_time is None or (end_time > te[1].timestamp() > start_time):
                        if te not in dict_time_evs:
                            dict_time_evs[te] = []
                        dict_time_evs[te].append(i)
    return dict_time_evs


if __name__ == '__main__':
    print(detect_time("VietTimes – Ngoại trưởng Mỹ Antony Blinken ngày đã tuyên bố trong một cuộc họp qua"
                      "truyền hình với ngoại trưởng các nước ASEAN Mỹ bác bỏ các yêu sách “bất hợp pháp” của"
                      "Trung Quốc ở Biển Đông.", language="vi"))

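detect_time above resolves day/month fragments with dateparser, so passing a base date pins patterns that omit the year; a small sketch with an invented sentence:

    import datetime
    from function.detect_time import detect_time

    base = datetime.datetime(2022, 3, 20)
    # "sáng 21/3" carries no year; RELATIVE_BASE makes dateparser anchor it near March 2022.
    print(detect_time("Cuộc họp diễn ra sáng 21/3 tại Hà Nội.", language="vi", base=base))
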
function/embed_vncorenlp.py
ADDED
@@ -0,0 +1,161 @@
import jnius_config
import os
import shutil

save_dir = "/home2/vietle/icgpt/vncorenlp-1.2"
max_heap_size = '-Xmx4g'
jnius_config.add_options(max_heap_size)
jnius_config.set_classpath(save_dir + "/VnCoreNLP-1.2.jar")


def download_model(save_dir='./'):
    # current_path = os.path.abspath(os.getcwd())
    if save_dir[-1] == '/':
        save_dir = save_dir[:-1]
    if os.path.isdir(save_dir + "/models") and os.path.exists(save_dir + '/VnCoreNLP-1.2.jar'):
        print("VnCoreNLP model folder " + save_dir + " already exists! Please load VnCoreNLP from this folder!")
    else:
        os.mkdir(save_dir + "/models")
        os.mkdir(save_dir + "/models/dep")
        os.mkdir(save_dir + "/models/ner")
        os.mkdir(save_dir + "/models/postagger")
        os.mkdir(save_dir + "/models/wordsegmenter")
        # jar
        os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.2.jar")
        shutil.move("VnCoreNLP-1.2.jar", save_dir + "/VnCoreNLP-1.2.jar")
        # wordsegmenter
        os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab")
        os.system(
            "wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr")
        shutil.move("vi-vocab", save_dir + "/models/wordsegmenter/vi-vocab")
        shutil.move("wordsegmenter.rdr", save_dir + "/models/wordsegmenter/wordsegmenter.rdr")
        # postagger
        os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/postagger/vi-tagger")
        shutil.move("vi-tagger", save_dir + "/models/postagger/vi-tagger")
        # ner
        os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/ner/vi-500brownclusters.xz")
        os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/ner/vi-ner.xz")
        os.system(
            "wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/ner/vi-pretrainedembeddings.xz")
        shutil.move("vi-500brownclusters.xz", save_dir + "/models/ner/vi-500brownclusters.xz")
        shutil.move("vi-ner.xz", save_dir + "/models/ner/vi-ner.xz")
        shutil.move("vi-pretrainedembeddings.xz", save_dir + "/models/ner/vi-pretrainedembeddings.xz")
        # parse
        os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/dep/vi-dep.xz")
        shutil.move("vi-dep.xz", save_dir + "/models/dep/vi-dep.xz")


class VnCoreNLP:
    def __init__(self, annotators=["wseg", "pos", "ner", "parse"], save_dir='./'):
        if save_dir[-1] == '/':
            save_dir = save_dir[:-1]
        if os.path.isdir(save_dir + "/models") == False or os.path.exists(save_dir + '/VnCoreNLP-1.2.jar') == False:
            raise Exception("Please download the VnCoreNLP model!")
        self.current_working_dir = os.getcwd()
        os.chdir(save_dir)

        from jnius import autoclass
        javaclass_vncorenlp = autoclass('vn.pipeline.VnCoreNLP')
        self.javaclass_String = autoclass('java.lang.String')
        self.annotators = annotators
        if "wseg" not in annotators:
            self.annotators.append("wseg")

        self.model = javaclass_vncorenlp(annotators)

    def annotate_text(self, text):
        from jnius import autoclass
        javaclass_Annotation = autoclass('vn.pipeline.Annotation')
        str = self.javaclass_String(text)
        annotation = javaclass_Annotation(str)
        self.model.annotate(annotation)
        dict_sentences = {}
        list_sentences = annotation.toString().split("\n\n")[:-1]
        for i in range(len(list_sentences)):
            list_words = list_sentences[i].split("\n")
            list_dict_words = []
            for word in list_words:
                dict_word = {}
                word = word.replace("\t\t", "\t")
                list_tags = word.split("\t")
                dict_word["index"] = int(list_tags[0])
                dict_word["wordForm"] = list_tags[1]
                dict_word["posTag"] = list_tags[2]
                dict_word["nerLabel"] = list_tags[3]
                if "parse" in self.annotators:
                    dict_word["head"] = int(list_tags[4])
                else:
                    dict_word["head"] = list_tags[4]
                dict_word["depLabel"] = list_tags[5]
                list_dict_words.append(dict_word)
            dict_sentences[i] = list_dict_words
        return dict_sentences

    def tokenize(self, text):
        annotated_sens = self.annotate_text(text=text)
        output = []
        for id_sen in annotated_sens:
            annotated_sen = annotated_sens[id_sen]
            out = [s["wordForm"] for s in annotated_sen]
            output.append(out)
        return output

    def pos_tag(self, text):
        annotated_sens = self.annotate_text(text=text)
        output = []
        for id_sen in annotated_sens:
            annotated_sen = annotated_sens[id_sen]
            out = [(s["wordForm"], s["posTag"]) for s in annotated_sen]
            output.append(out)
        return output

    def ner(self, text):
        annotated_sens = self.annotate_text(text=text)
        output = []
        for id_sen in annotated_sens:
            annotated_sen = annotated_sens[id_sen]
            out = [(s["wordForm"], s["nerLabel"]) for s in annotated_sen]
            output.append(out)
        return output

    def word_segment(self, text):
        from jnius import autoclass
        javaclass_Annotation = autoclass('vn.pipeline.Annotation')
        str = self.javaclass_String(text)
        annotation = javaclass_Annotation(str)
        self.model.annotate(annotation)
        list_segmented_sentences = []
        list_sentences = annotation.toString().split("\n\n")[:-1]
        for sent in list_sentences:
            list_words = sent.split("\n")
            list_segmented_words = []
            for word in list_words:
                word = word.replace("\t\t", "\t")
                list_tags = word.split("\t")
                list_segmented_words.append(list_tags[1])
            list_segmented_sentences.append(" ".join(list_segmented_words))
        return list_segmented_sentences

    def print_out(self, dict_sentences):
        for sent in dict_sentences.keys():
            list_dict_words = dict_sentences[sent]
            for word in list_dict_words:
                print(str(word["index"]) + "\t" + word["wordForm"] + "\t" + word["posTag"] + "\t" + word["nerLabel"] + "\t" + str(word["head"]) + "\t" + word["depLabel"])
            print("")

    def annotate_file(self, input_file, output_file):
        os.chdir(self.current_working_dir)
        input_str = self.javaclass_String(input_file)
        output_str = self.javaclass_String(output_file)
        self.model.processPipeline(input_str, output_str, self.annotators)


if __name__ == '__main__':
    download_model(save_dir='/home2/vietle/icgpt/vncorenlp-1.2')
    model = VnCoreNLP(annotators=["wseg", "pos", "ner"], save_dir='/home2/vietle/icgpt/vncorenlp-1.2')
    # output = model.annotate_text("Ông Nguyễn Khắc Chúc đang làm việc tại Đại học Quốc gia Hà Nội. Bà Lan, vợ ông Chúc, cũng làm việc tại đây.")
    # print(output)

    text = "Sau khi tốt nghiệp Trung học năm 1975, ông theo học dự bị Ngoại ngữ tại Đại học Ngoại ngữ (nay là Trường Đại học Hà Nội)."
    out = model.tokenize(text)
    print(out)
    # model.print_out(output)

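The module above fixes the JVM classpath at import time, so the VnCoreNLP-1.2.jar and model folder must already exist at the hard-coded save_dir before anything is imported from it. A sketch mirroring its own __main__ block, assuming that path is valid on the host:

    from function.embed_vncorenlp import VnCoreNLP, download_model

    save_dir = "/home2/vietle/icgpt/vncorenlp-1.2"   # same path hard-coded at the top of the module
    download_model(save_dir=save_dir)                # only fetches files when the jar/models are missing
    segmenter = VnCoreNLP(annotators=["wseg"], save_dir=save_dir)
    print(segmenter.word_segment("Ông Nguyễn Khắc Chúc đang làm việc tại Đại học Quốc gia Hà Nội."))
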
function/sentence_embbeding.py
ADDED
@@ -0,0 +1,41 @@
import requests
import json

URL_EMBBED_ZH = "http://10.9.3.239:1999/api/v1/extract_feature_zh"
URL_EMBBED_EN = "http://10.9.3.239:1999/api/v1/extract_feature_en"
URL_EMBBED_BGE = "http://10.9.3.240:5045/api/v1/embedding"


def embbeded_zh(text: list):
    try:
        r = requests.post(URL_EMBBED_ZH, data=json.dumps({
            "text": text
        }))
        embs = r.json()["vectors"]
        return embs
    except Exception as ve:
        print(ve)
        return []


def embbeded_en(text: list):
    try:
        r = requests.post(URL_EMBBED_EN, data=json.dumps({
            "text": text
        }))
        embs = r.json()["vectors"]
        return embs
    except Exception as ve:
        print(ve)
        return []


def embedded_bge(text: list):
    try:
        r = requests.post(URL_EMBBED_BGE, data=json.dumps({
            "text": text
        }))
        embs = r.json()["embeddings"]
        return embs
    except Exception as ve:
        print(ve)
        return []

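These helpers just POST to internal embedding services and return plain lists, which downstream code (for example the clustering in tc_v2.py) turns into numpy matrices; a hedged sketch, assuming the BGE endpoint is reachable from where it runs:

    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity
    from function.sentence_embbeding import embedded_bge

    sentences = ["Giá xăng tăng mạnh.", "Xăng dầu đồng loạt tăng giá.", "Đội tuyển thắng 2-0."]
    vectors = embedded_bge(sentences)
    if vectors:  # an empty list means the request failed
        print(np.round(cosine_similarity(np.array(vectors)), 2))
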
function/summary_with_llm.py
ADDED
@@ -0,0 +1,210 @@
|
|
|
|
|
1 |
+
import json
|
2 |
+
import requests
|
3 |
+
import nltk
|
4 |
+
import re
|
5 |
+
import time
|
6 |
+
|
7 |
+
nltk.download('punkt')
|
8 |
+
|
9 |
+
|
10 |
+
URL_LLAMA = "http://10.9.3.241:8022/api/v1/llama/QnA"
|
11 |
+
VERSION = {
|
12 |
+
"en-llama": "mistral-full", #"llama2-7b",
|
13 |
+
"vi-llama": "mistral-full" #"ic-llama-68k"
|
14 |
+
}
|
15 |
+
|
16 |
+
SYSTEM_PROMPT = ""
|
17 |
+
POST_FIX_Q = ""
|
18 |
+
|
19 |
+
SYSTEM_PROMPT_CHECK_RELEVANT = "Bạn là trợ lý AI giúp mọi người tìm kiếm thông tin. Người dùng sẽ cung cấp cho bạn một câu hỏi. Nhiệm vụ của bạn là trả lời trung thực nhất có thể."
|
20 |
+
PROMPT_RELEVANT = 'Câu trả lời dưới có liên quan đến câu hỏi "{0}" hay không?'
|
21 |
+
|
22 |
+
|
23 |
+
REMOVE_WORDS = []
|
24 |
+
with open("data/remove.txt", 'r') as f:
|
25 |
+
for line in f:
|
26 |
+
line = line.strip()
|
27 |
+
if line:
|
28 |
+
REMOVE_WORDS.append(line.lower())
|
29 |
+
print(REMOVE_WORDS)
|
30 |
+
|
31 |
+
|
32 |
+
def check_answer_contains_remove_words(text):
|
33 |
+
text = text.lower()
|
34 |
+
text = re.sub(r'\s+ ', ' ', text)
|
35 |
+
lst_sen = nltk.sent_tokenize(text)
|
36 |
+
for sen in lst_sen:
|
37 |
+
for pat in REMOVE_WORDS:
|
38 |
+
if re.search(pat, sen) is not None:
|
39 |
+
return True
|
40 |
+
return False
|
41 |
+
|
42 |
+
|
43 |
+
def normalize_text_after_qna_llm(text, prompt: str = ""):
|
44 |
+
|
45 |
+
text = re.sub("^Dựa trên.{1,60}cung cấp,", "", text.strip(), flags=re.I).strip()
|
46 |
+
text = re.sub("^Dự báo.{1,60}là", "", text, flags=re.I).strip()
|
47 |
+
text = re.sub("^.{1,15}dựa trên.{1,60}cung cấp,", "", text, flags=re.I).strip()
|
48 |
+
text = re.sub("^.{1,15}theo thông tin.{1,60}cung cấp,", "", text, flags=re.I).strip()
|
49 |
+
# if len(prompt) > 10:
|
50 |
+
# print("pattern: ", "^" + prompt[:10] + ".{1,40}là")
|
51 |
+
# text = re.sub("^" + prompt[:10] + ".{1,40}là", "", text, flags=re.I).strip()
|
52 |
+
if text and text[0].islower():
|
53 |
+
text = text[0].upper() + text[1:]
|
54 |
+
return text
|
55 |
+
|
56 |
+
|
57 |
+
def created_context(text, max_word=2048):
|
58 |
+
text = re.sub(r'\s+ ', ' ', text)
|
59 |
+
lst_sen = nltk.sent_tokenize(text)
|
60 |
+
count_w = 0
|
61 |
+
contexts = []
|
62 |
+
lst_sen_per_context = []
|
63 |
+
for sen in lst_sen:
|
64 |
+
count_w += sen.count(" ")
|
65 |
+
lst_sen_per_context.append(sen)
|
66 |
+
if count_w > max_word:
|
67 |
+
contexts.append(" ".join(lst_sen_per_context))
|
68 |
+
lst_sen_per_context = []
|
69 |
+
count_w = 0
|
70 |
+
if lst_sen_per_context:
|
71 |
+
contexts.append(" ".join(lst_sen_per_context))
|
72 |
+
return contexts
|
73 |
+
|
74 |
+
|
75 |
+
def get_icgpt_result(questions, contexts, lang, version="vi-llama", system_prompt:str = "", id_cluster="", hash_str: str = ""):
|
76 |
+
json_body = {
|
77 |
+
"questions": questions,
|
78 |
+
"contexts": contexts,
|
79 |
+
"lang": lang,
|
80 |
+
"version": VERSION[version],
|
81 |
+
"system_prompt": system_prompt
|
82 |
+
}
|
83 |
+
try:
|
84 |
+
|
85 |
+
res = requests.post(URL_LLAMA, json=json_body)
|
86 |
+
with open(f"log_llm/requests_llama/{hash_str}_{id_cluster}.txt", "w") as f:
|
87 |
+
f.write(json.dumps(json_body) + "\n")
|
88 |
+
f.write(json.dumps(res.json()) + "\n" + "$"*50)
|
89 |
+
res = res.json()["result"]
|
90 |
+
res = [x.replace("_", " ") for x in res]
|
91 |
+
return res
|
92 |
+
except Exception as ex:
|
93 |
+
print(f"[ERROR] get_icgpt_official_visit: {str(ex)}")
|
94 |
+
with open(f"log_llm/requests_llama_error/{hash_str}_{id_cluster}.txt", "w") as f:
|
95 |
+
f.write(json.dumps(json_body) + "\n")
|
96 |
+
return None
|
97 |
+
|
98 |
+
|
99 |
+
def check_relevant_with_llama(question, content, lang, version="en-llama", max_word_per_context=1024, id_cluster="", hash_str: str = ""):
|
100 |
+
contexts = created_context(content, max_word=max_word_per_context)
|
101 |
+
content = contexts[0] if contexts else ""
|
102 |
+
answer = ""
|
103 |
+
question_all = question.split("#####")
|
104 |
+
check_rel = True
|
105 |
+
for question in question_all:
|
106 |
+
question = question.strip()
|
107 |
+
# question = PROMPT_RELEVANT.format(question)
|
108 |
+
question_split = question.split("$$$$")
|
109 |
+
if len(question_split) < 2:
|
110 |
+
question = question
|
111 |
+
system_prompt = ""
|
112 |
+
else:
|
113 |
+
system_prompt = question_split[0].strip()
|
114 |
+
question = question_split[1].strip()
|
115 |
+
|
116 |
+
# system_prompt = SYSTEM_PROMPT_CHECK_RELEVANT
|
117 |
+
if content:
|
118 |
+
contexts = [content]
|
119 |
+
questions = [question]
|
120 |
+
lst_res = get_icgpt_result(questions, contexts, lang, version=version, system_prompt=system_prompt, id_cluster=f"{id_cluster}_recheck_relevant", hash_str=hash_str)
|
121 |
+
if lst_res is None:
|
122 |
+
lst_res = []
|
123 |
+
check_rel = True
|
124 |
+
# continue
|
125 |
+
# break
|
126 |
+
for an in lst_res:
|
127 |
+
if an:
|
128 |
+
is_relevant = check_answer_contains_remove_words(an)
|
129 |
+
answer = normalize_text_after_qna_llm(an, prompt=question)
|
130 |
+
if len(answer) == 0:
|
131 |
+
check_rel = True
|
132 |
+
# continue
|
133 |
+
# break
|
134 |
+
else:
|
135 |
+
check_rel = is_relevant
|
136 |
+
if not check_rel:
|
137 |
+
break
|
138 |
+
if not check_rel:
|
139 |
+
break
|
140 |
+
else:
|
141 |
+
check_rel = True
|
142 |
+
return check_rel
|
143 |
+
|
144 |
+
|
145 |
+
def summary_with_llama(question, content, lang, version="vi-llama", max_word_per_context=1024, id_cluster="", hash_str: str = ""):
|
146 |
+
contexts = created_context(content, max_word=max_word_per_context)
|
147 |
+
question_split = question.split("$$$$")
|
148 |
+
if len(question_split) < 2:
|
149 |
+
question = question
|
150 |
+
system_prompt = ""
|
151 |
+
else:
|
152 |
+
system_prompt = question_split[0].strip()
|
153 |
+
question = question_split[1].strip()
|
154 |
+
|
155 |
+
if question.strip().endswith(".") or question.strip().endswith("?"):
|
156 |
+
question = (question + POST_FIX_Q).strip()
|
157 |
+
else:
|
158 |
+
question = (question + ". " + POST_FIX_Q).strip()
|
159 |
+
answer = ""
|
160 |
+
if contexts:
|
161 |
+
contexts = [contexts[0]]
|
162 |
+
questions = [question] * len(contexts)
|
163 |
+
lst_res = get_icgpt_result(questions, contexts, lang, version=version, system_prompt=system_prompt, id_cluster=id_cluster, hash_str=hash_str)
|
164 |
+
results = []
|
165 |
+
if lst_res is None:
|
166 |
+
lst_res = []
|
167 |
+
for an in lst_res:
|
168 |
+
|
169 |
+
is_relevant = check_answer_contains_remove_words(an)
|
170 |
+
|
171 |
+
if not is_relevant:
|
172 |
+
results.append(an)
|
173 |
+
# else:
|
174 |
+
# print("ans_from_llama-preview: ", an, is_relevant)
|
175 |
+
if results:
|
176 |
+
if len(results) > 1:
|
177 |
+
context_combine = [". ".join(results)]
|
178 |
+
res = get_icgpt_result([question], context_combine, lang, version=version, system_prompt=system_prompt, id_cluster=id_cluster, hash_str=hash_str)
|
179 |
+
if res:
|
180 |
+
answer = res[0]
|
181 |
+
else:
|
182 |
+
answer = results[0]
|
183 |
+
else:
|
184 |
+
if lst_res:
|
185 |
+
return "", True
|
186 |
+
if answer:
|
187 |
+
is_relevant = check_answer_contains_remove_words(answer)
|
188 |
+
# print("ans_from_llama-before: ", answer, is_relevant)
|
189 |
+
answer = normalize_text_after_qna_llm(answer, prompt=question)
|
190 |
+
|
191 |
+
if len(answer) == 0:
|
192 |
+
is_relevant = True
|
193 |
+
# print("ans_from_llama-after: ", answer, is_relevant)
|
194 |
+
return answer, is_relevant
|
195 |
+
|
196 |
+
|
197 |
+
if __name__ == '__main__':
|
198 |
+
text = """ Dựa trên thông tin được cung cấp, xung đột ở Biển Đông đã gia tăng và có thể xuất hiện điểm xung đột mới giữa Philippines và Trung Quốc. Philippines đang xem xét xây dựng một ngọn hải đăng trên Bãi cạn Second Thomas (còn gọi là Bãi cạn Second Thomas), điều này đã khiến Trung Quốc tuyên bố rằng đây là một hành động vi phạm nghiêm trọng chủ quyền của họ và sẽ đáp trả kiên quyết. Giới chuyên gia cho rằng điểm bùng phát xung đột mới có thể xảy ra giữa Philippines và Trung nước ở Biển Đông. Tham vọng hung hăng của Trung Quốc đã thúc đẩy căng thẳng khu vực và các tranh chấp gần đây về vụ va chạm gần đây của các tàu gần Bãi Cỏ Mây (Second Thomas Shoal) ở Biển Đông đã làm căng th ng giữa Trung Quốc và Philippines gia tăng. """
|
199 |
+
is_re = check_answer_contains_remove_words(text)
|
200 |
+
print(is_re)
|
201 |
+
an = normalize_text_after_qna_llm(text)
|
202 |
+
print(an)
|
203 |
+
# exit(1)
|
204 |
+
# message = "G\u1EA7n \u0111\u00E2y, Philippines \u0111\u00E3 nhi\u1EC1u l\u1EA7n g\u00E2y ra r\u1EAFc r\u1ED1i \u1EDF Bi\u1EC3n \u0110\u00F4ng, x\u00E2m nh\u1EADp \u0110\u00E1 Nh\u00E2n \u00C1i thu\u1ED9c qu\u1EA7n \u0111\u1EA3o Nam Sa c\u1EE7a Trung Qu\u1ED1c, \u0111\u1ED3ng th\u1EDDi ti\u1EBFp t\u1EE5c ph\u00F3ng \u0111\u1EA1i v\u00E0 c\u01B0\u1EDDng \u0111i\u1EC7u h\u00F3a, nh\u1EA7m l\u1EABn \u0111\u00FAng sai v\u00E0 \u0111\u01B0a ra nh\u1EEFng c\u00E1o bu\u1ED9c v\u00F4 l\u00FD ch\u1ED1ng l\u1EA1i Trung Qu\u1ED1c. \\n Ng\u01B0\u1EDDi Philippines ngh\u0129 g\u00EC v\u1EC1 h\u00E0nh \u0111\u1ED9ng c\u1EE7a ch\u00EDnh ph\u1EE7 Philippines? \\ n \u00D4ng Tan \u0111\u00E3 t\u00ECm ra Herman Laurel, m\u1ED9t nh\u00E0 b\u00ECnh lu\u1EADn ch\u00EDnh tr\u1ECB n\u1ED5i ti\u1EBFng \u1EDF Philippines, \u0111\u1ED3ng th\u1EDDi l\u00E0 gi\u00E1m \u0111\u1ED1c Vi\u1EC7n nghi\u00EAn c\u1EE9u chi\u1EBFn l\u01B0\u1EE3c \"Th\u1EBF k\u1EF7 ch\u00E2u \u00C1\" c\u1EE7a Philippines v\u00E0 l\u00E0 ng\u01B0\u1EDDi s\u00E1ng l\u1EADp Hi\u1EC7p h\u1ED9i nghi\u00EAn c\u1EE9u ch\u00EDnh s\u00E1ch BRICS c\u1EE7a Philippines. Herman Laurel, ng\u01B0\u1EDDi \u0111ang nghi\u00EAn c\u1EE9u v\u1EC1 ch\u00E2u \u00C1, \u0111\u1EB7c bi\u1EC7t l\u00E0 m\u1ED1i quan h\u1EC7 gi\u1EEFa Trung Qu\u1ED1c v\u00E0 ASEAN, chia s\u1EBB quan s\u00E1t c\u1EE7a m\u00ECnh. \\n 01 \\n Tan Zhu: K\u1EC3 t\u1EEB n\u1EEDa cu\u1ED1i n\u0103m nay, Philippines th\u01B0\u1EDDng xuy\u00EAn x\u00E2m chi\u1EBFm c\u00E1c \u0111\u1EA3o v\u00E0 b\u00E3i \u0111\u00E1 c\u1EE7a ch\u00FAng t\u00F4i \u1EDF Bi\u1EC3n \u0110\u00F4ng, l\u00E0m gia t\u0103ng c\u0103ng th\u1EB3ng \u1EDF Bi\u1EC3n \u0110\u00F4ng, \u0111\u1ED3ng th\u1EDDi vu kh\u1ED1ng Trung Qu\u1ED1c l\u00E0 k\u1EBB b\u1EAFt n\u1EA1t l\u1EDBn. , c\u00F3 nh\u1EEFng nh\u00F3m \u1EDF Philippines ph\u1EA3n \u0111\u1ED1i h\u00E0nh \u0111\u1ED9ng khi\u00EAu kh\u00EDch c\u1EE7a Philippines. B\u1EA1n c\u1EA3m th\u1EA5y th\u1EBF n\u00E0o v\u1EC1 \u0111i\u1EC1u n\u00E0y v\u1EDBi t\u01B0 c\u00E1ch l\u00E0 m\u1ED9t ng\u01B0\u1EDDi Philippines? Herman Laurel: T\u00F4i cho r\u1EB1ng C\u1EA3nh s\u00E1t bi\u1EC3n Philippines v\u00E0 m\u1ED9t s\u1ED1 t\u00E0u d\u00E2n s\u1EF1 Philippines \u0111\u00E3 c\u1ED1 t\u00ECnh g\u00E2y r\u1EAFc r\u1ED1i v\u00EC h\u1ECD \u0111ang th\u1EF1c hi\u1EC7n \u201CD\u1EF1 \u00E1n t\u1ED5ng th\u1EC3\u201D c\u1EE7a Hoa K\u1EF3. \\nM\u1EF9 khoe khoang d\u1EF1 \u00E1n n\u00E0y v\u1EDBi ch\u00FAng t\u00F4i n\u00EAn ch\u00FAng t\u00F4i bi\u1EBFt t\u00EAn d\u1EF1 \u00E1n. D\u1EF1 \u00E1n \u0111\u01B0\u1EE3c d\u1EABn d\u1EAFt b\u1EDFi \u0110\u1EA1i t\u00E1 Kh\u00F4ng qu\u00E2n Hoa K\u1EF3 \u0111\u00E3 ngh\u1EC9 h\u01B0u Raimundo Powell, l\u00E0m vi\u1EC7c v\u1EDBi c\u00E1c \u0111\u1ED1i t\u00E1c Philippines nh\u01B0 C\u1EA3nh s\u00E1t bi\u1EC3n Philippines v\u00E0 m\u1ED9t s\u1ED1 quan ch\u1EE9c c\u1EE7a B\u1ED9 Ngo\u1EA1i giao Philippines. H\u1ECD l\u00EAn k\u1EBF ho\u1EA1ch, \u00E2m m\u01B0u v\u00E0 th\u1EF1c hi\u1EC7n d\u1EF1 \u00E1n n\u00E0y nh\u1EB1m \u0111\u1ED1i \u0111\u1EA7u, khi\u00EAu kh\u00EDch v\u00E0 \u0111\u1ED1i \u0111\u1EA7u v\u1EDBi Qu\u00E2n \u0111o\u00E0n C\u1EA3nh s\u00E1t bi\u1EC3n thu\u1ED9c L\u1EF1c l\u01B0\u1EE3ng C\u1EA3nh s\u00E1t V\u0169 trang Nh\u00E2n d\u00E2n Trung Qu\u1ED1c. 
T\u1EA5t c\u1EA3 nh\u1EEFng \u0111i\u1EC1u n\u00E0y \u0111\u1EC1u nh\u1EB1m m\u1EE5c \u0111\u00EDch c\u1ED1 t\u00ECnh t\u1EA1o ra c\u0103ng th\u1EB3ng \u1EDF Bi\u1EC3n \u0110\u00F4ng, \u0111\u1EB7c bi\u1EC7t l\u00E0 t\u1EA1i c\u00E1c khu v\u1EF1c tranh ch\u1EA5p gi\u1EEFa Philippines v\u00E0 Trung Qu\u1ED1c nh\u01B0 B\u00E3i c\u1EA1n Scarborough hay B\u00E3i c\u1EA1n Second Thomas. \\n \\n \\n \\n \\n Vi\u1EC7c Trung Qu\u1ED1c s\u1EED d\u1EE5ng t\u00E0u l\u1EDBn v\u00E0 t\u00E0u chi\u1EBFn l\u00E0 \u0111i\u1EC1u t\u1ED1t. L\u1EE3i th\u1EBF v\u1EC1 s\u1ED1 l\u01B0\u1EE3ng v\u00E0 quy m\u00F4 c\u1EE7a L\u1EF1c l\u01B0\u1EE3ng C\u1EA3nh s\u00E1t bi\u1EC3n thu\u1ED9c L\u1EF1c l\u01B0\u1EE3ng C\u1EA3nh s\u00E1t V\u0169 trang Nh\u00E2n d\u00E2n Trung Qu\u1ED1c th\u1EF1c s\u1EF1 tr\u00E1nh \u0111\u01B0\u1EE3c m\u1ED9t cu\u1ED9c \u0111\u1ED1i \u0111\u1EA7u th\u1EF1c s\u1EF1 nghi\u00EAm tr\u1ECDng h\u01A1n, b\u1EDFi v\u00EC n\u1EBFu hai b\u00EAn c\u00F3 quy m\u00F4 ngang nhau, M\u1ED9t cu\u1ED9c xung \u0111\u1ED9t h\u1EA1m \u0111\u1ED9i c\u00F3 th\u1EC3 tr\u1EDF n\u00EAn s\u1EAFp x\u1EA3y ra. \\ n Nh\u01B0ng d\u00F9 th\u1EBF n\u00E0o \u0111i n\u1EEFa, c\u00E1c ph\u01B0\u01A1ng ti\u1EC7n truy\u1EC1n th\u00F4ng ph\u01B0\u01A1ng T\u00E2y s\u1EBD m\u00F4 t\u1EA3 Trung Qu\u1ED1c l\u00E0 k\u1EBB b\u1EAFt n\u1EA1t. \u0110\u00E2y l\u00E0 m\u1EE5c \u0111\u00EDch th\u1EF1c s\u1EF1 c\u1EE7a d\u1EF1 \u00E1n n\u00E0y v\u00E0 l\u00E0 m\u1EE5c ti\u00EAu m\u00E0 Hoa K\u1EF3 \u0111\u1EB7t ra - h\u1ECD s\u1EED d\u1EE5ng c\u00E1i g\u1ECDi l\u00E0 \"ni\u1EC1m tin v\u00E0o t\u00EDnh minh b\u1EA1ch c\u1EE7a b\u00E1o c\u00E1o\" \u0111\u1EC3 c\u1EA5u th\u00E0nh m\u1ED9t \"\u00E2m m\u01B0u \u0111en t\u1ED1i\" ch\u1ED1ng l\u1EA1i Trung Qu\u1ED1c... c\u00F4ng khai\". \u0110\u01B0\u1EE3c cho l\u00E0 minh b\u1EA1ch v\u00EC truy\u1EC1n th\u00F4ng ph\u01B0\u01A1ng T\u00E2y \u0111\u00E3 mang theo camera, nh\u00E2n vi\u00EAn truy\u1EC1n th\u00F4ng... \u0111\u1EC3 ghi l\u1EA1i qu\u00E1 tr\u00ECnh n\u00E0y v\u00E0 \u0111\u01B0a tin cho kh\u00E1n gi\u1EA3 Philippines v\u00E0 th\u1EBF gi\u1EDBi. \\nHoa K\u1EF3 \u0111\u00E3 ch\u1EC9 ra trong \u0111\u1EC1 xu\u1EA5t d\u1EF1 \u00E1n r\u1EB1ng m\u1EE5c ti\u00EAu c\u1EE7a h\u1ECD l\u00E0 khi\u1EBFn Trung Qu\u1ED1c ph\u1EA3i tr\u1EA3 gi\u00E1. T\u00F4i ngh\u0129 m\u1ECDi ng\u01B0\u1EDDi \u0111\u1EC1u c\u00F3 th\u1EC3 hi\u1EC3u \u0111i\u1EC1u n\u00E0y c\u00F3 ngh\u0129a l\u00E0 g\u00EC - Hoa K\u1EF3 mu\u1ED1n h\u1EE7y ho\u1EA1i danh ti\u1EBFng c\u1EE7a Trung Qu\u1ED1c. \\n C\u00F3 m\u1ED9t th\u1EDDi, Philippines c\u00F3 m\u1ED1i quan h\u1EC7 r\u1EA5t h\u1EEFu \u00EDch v\u00E0 th\u00E2n thi\u1EC7n v\u1EDBi Trung Qu\u1ED1c. C\u00F3 nhi\u1EC1u c\u1EA7u n\u1ED1i h\u1EE3p t\u00E1c gi\u1EEFa ch\u00FAng ta, ch\u1EB3ng h\u1EA1n nh\u01B0 c\u00E1c d\u1EF1 \u00E1n h\u1EE3p t\u00E1c trong vi\u1EC7c c\u00F9ng x\u00E2y d\u1EF1ng s\u00E1ng ki\u1EBFn \u200B\u200B\u201CV\u00E0nh \u0111ai v\u00E0 Con \u0111\u01B0\u1EDDng\u201D, gi\u00FAp t\u0103ng th\u00EAm l\u1EE3i th\u1EBF kinh t\u1EBF c\u1EE7a Philippines. \\n Tuy nhi\u00EAn, T\u1ED5ng th\u1ED1ng Philippines Marcos r\u00F5 r\u00E0ng c\u00F3 quan \u0111i\u1EC3m ri\u00EAng v\u1EC1 s\u1EF1 d\u00E0n x\u1EBFp c\u1EE7a M\u1EF9, \u00F4ng ch\u01B0a bao gi\u1EDD gi\u1EA3i th\u00EDch cho ng\u01B0\u1EDDi d\u00E2n Philippines l\u00FD do quay tr\u1EDF l\u1EA1i v\u1EDBi M\u1EF9. V\u1EC1 v\u1EA5n \u0111\u1EC1 Bi\u1EC3n \u0110\u00F4ng, ch\u00EDnh s\u00E1ch c\u1EE7a Marcos c\u00E0ng th\u00F9 \u0111\u1ECBch v\u00E0 hung h\u0103ng h\u01A1n. 
\\n Tuy nhi\u00EAn, ng\u01B0\u1EDDi d\u00E2n Philippines kh\u00F4ng quan t\u00E2m \u0111\u1EBFn tranh ch\u1EA5p Bi\u1EC3n \u0110\u00F4ng. \\n \"Asia Pulse\" c\u00F3 th\u1EC3 n\u00F3i l\u00E0 m\u1ED9t t\u1ED5 ch\u1EE9c b\u1ECF phi\u1EBFu, m\u1EE5c \u0111\u00EDch l\u00E0 t\u00ECm hi\u1EC3u mong mu\u1ED1n c\u1EE7a ng\u01B0\u1EDDi d\u00E2n Ch\u00E2u \u00C1. V\u00E0o th\u00E1ng 9 n\u0103m 2023, m\u1ED9t b\u00E1o c\u00E1o th\u0103m d\u00F2 \u00FD ki\u1EBFn \u200B\u200Bcho th\u1EA5y ch\u1EC9 c\u00F3 7% ng\u01B0\u1EDDi d\u00E2n Philippines quan t\u00E2m \u0111\u1EBFn tranh ch\u1EA5p Bi\u1EC3n \u0110\u00F4ng, trong khi 93% ng\u01B0\u1EDDi d\u00E2n th\u1EF1c s\u1EF1 ch\u1EC9 quan t\u00E2m \u0111\u1EBFn gi\u00E1 c\u1EA3 cao, l\u1EA1m ph\u00E1t, vi\u1EC7c l\u00E0m, v\u1EA5n \u0111\u1EC1 t\u1ED9i ph\u1EA1m gia t\u0103ng v\u00E0 v\u1EA5n \u0111\u1EC1 ma t\u00FAy, v.v. . . \\nGi\u1EDD \u0111\u00E2y, ngay c\u1EA3 gia \u0111\u00ECnh Marcos c\u0169ng b\u1ECB chia r\u1EBD. Em g\u00E1i c\u1EE7a Marcos, v\u1EDBi t\u01B0 c\u00E1ch l\u00E0 ch\u1EE7 t\u1ECBch \u1EE6y ban \u0110\u1ED1i ngo\u1EA1i Th\u01B0\u1EE3ng vi\u1EC7n Philippines, \u0111\u00E3 ph\u1EA3n \u0111\u1ED1i vi\u1EC7c Marcos quay sang Hoa K\u1EF3 v\u00E0 h\u00E0nh vi hung h\u0103ng c\u1EE7a Marcos \u1EDF ch\u00E2u \u00C1, \u0111\u1ED3ng th\u1EDDi ch\u1EC9 tr\u00EDch vi\u1EC7c tri\u1EC3n khai c\u00E1c c\u0103n c\u1EE9 c\u1EE7a M\u1EF9 \u1EDF Philippines v\u00E0 nh\u1EEFng c\u0103ng th\u1EB3ng do Hoa K\u1EF3 g\u00E2y ra \u1EDF Philippines. Bi\u1EC3n \u0110\u00F4ng. . \\n 02 \\n Tan Zhu: Sau khi th\u1ED5i ph\u1ED3ng s\u1EF1 c\u1ED1 \u0110\u00E1 Nh\u00E2n \u00C1i, kh\u00F4ng qu\u1ED1c gia n\u00E0o \u1EE7ng h\u1ED9 Philippines l\u00E0 qu\u1ED1c gia ASEAN. Ph\u1EA3i ch\u0103ng \u0111i\u1EC1u n\u00E0y c\u00F3 ngh\u0129a l\u00E0 Philippines \u0111\u00E3 b\u1ECB c\u00F4 l\u1EADp \u1EDF c\u1EA5p \u0111\u1ED9 ngo\u1EA1i giao khu v\u1EF1c do \u0111i \u0111\u1EA7u trong vi\u1EC7c g\u00E2y r\u1EAFc r\u1ED1i ? C\u00E1c n\u01B0\u1EDBc trong khu v\u1EF1c Bi\u1EC3n \u0110\u00F4ng mu\u1ED1n th\u1EA5y lo\u1EA1i Bi\u1EC3n \u0110\u00F4ng nh\u01B0 th\u1EBF n\u00E0o? \\n Herman Laurel: T\u00F4i ngh\u0129 ASEAN r\u00F5 r\u00E0ng kh\u00F4ng mu\u1ED1n c\u0103ng th\u1EB3ng leo thang. \\nTrong cu\u1ED9c ph\u1ECFng v\u1EA5n v\u1EDBi m\u1ED9t t\u1EDD b\u00E1o \u0111\u1ECBa ph\u01B0\u01A1ng \u1EDF Philippines, t\u00F4i n\u00F3i r\u1EB1ng Kh\u00F4ng qu\u00E2n Hoa K\u1EF3 v\u00E0 c\u00E1c l\u1EF1c l\u01B0\u1EE3ng qu\u00E2n s\u1EF1 kh\u00E1c \u0111\u00E3 can thi\u1EC7p v\u00E0o c\u00F4ng vi\u1EC7c c\u1EE7a Philippines \u1EDF Bi\u1EC3n \u0110\u00F4ng, v\u00E0 ASEAN kh\u00F4ng th\u00EDch s\u1EF1 can thi\u1EC7p c\u1EE7a M\u1EF9 v\u00E0o Bi\u1EC3n \u0110\u00F4ng. \\nM\u1EB7c d\u00F9 c\u00E1c n\u01B0\u1EDBc ASEAN r\u1EA5t l\u1ECBch s\u1EF1 v\u1EDBi nhau v\u00E0 kh\u00F4ng tr\u1EF1c ti\u1EBFp n\u00EAu t\u00EAn Philippines nh\u01B0ng h\u1ECD kh\u00F4ng tham gia c\u00E1c h\u00E0nh \u0111\u1ED9ng \u0111\u1ED9c l\u1EADp c\u1EE7a Philippines m\u00E0 c\u00F4 l\u1EADp Philippines. \\nT\u00F4i ngh\u0129 Th\u1EE7 t\u01B0\u1EDBng Singapore L\u00FD Hi\u1EC3n Long v\u00E0 c\u00E1c nh\u00E0 l\u00E3nh \u0111\u1EA1o kh\u00E1c \u0111\u00E3 \u0111\u01B0a ra m\u1ED9t s\u1ED1 b\u00ECnh lu\u1EADn gi\u00E1n ti\u1EBFp \u0111\u1EC1 c\u1EADp \u0111\u1EBFn Philippines v\u00E0 c\u1EA3nh b\u00E1o n\u01B0\u1EDBc n\u00E0y kh\u00F4ng n\u00EAn tr\u1EDF th\u00E0nh chi\u1EBFn tr\u01B0\u1EDDng. 
\u0110i\u1EC1u n\u00E0y th\u1EC3 hi\u1EC7n r\u1EA5t r\u00F5 m\u1ED1i quan ng\u1EA1i c\u1EE7a Singapore v\u00E0 truy\u1EC1n t\u1EA3i m\u1ED1i quan ng\u1EA1i t\u01B0\u01A1ng t\u1EF1 c\u1EE7a c\u00E1c n\u01B0\u1EDBc ASEAN. \\nT\u00F4i c\u0169ng \u0111ang li\u00EAn h\u1EC7 v\u1EDBi m\u1ED9t s\u1ED1 \u0111\u1EA1i s\u1EE9 qu\u00E1n ASEAN t\u1EA1i Manila. Trong cu\u1ED9c tr\u00F2 chuy\u1EC7n v\u1EDBi h\u1ECD, t\u00F4i \u0111\u00E3 nghe h\u1ECD b\u00E0y t\u1ECF m\u1ED1i quan ng\u1EA1i c\u1EE7a m\u1ED7i n\u01B0\u1EDBc v\u1EC1 c\u00E1c h\u00E0nh \u0111\u1ED9ng hi\u1EC7n t\u1EA1i c\u1EE7a Philippines \u1EDF Bi\u1EC3n \u0110\u00F4ng. \\nTr\u00EAn th\u1EF1c t\u1EBF, t\u00ECnh h\u00ECnh do Marcos v\u00E0 Hoa K\u1EF3 t\u1EA1o ra c\u0169ng \u0111\u00E3 khi\u1EBFn ng\u01B0\u1EDDi d\u00E2n c\u1EE7a ch\u00FAng t\u00F4i lo l\u1EAFng v\u1EC1 an ninh qu\u1ED1c gia. \\n Marcos v\u00E0 c\u00E1c th\u00E0nh vi\u00EAn n\u1ED9i c\u00E1c an ninh c\u1EE7a \u00F4ng \u0111\u00E3 th\u00F4ng b\u00E1o r\u1EB1ng h\u1ECD \u0111ang chuy\u1EC3n s\u1EF1 ch\u00FA \u00FD c\u1EE7a \u0111\u1EA5t n\u01B0\u1EDBc t\u1EEB an ninh n\u1ED9i b\u1ED9 sang an ninh b\u00EAn ngo\u00E0i, t\u1EE9c l\u00E0 h\u1ECD \u0111ang l\u00E0m \u0111i\u1EC1u \u0111\u00F3 m\u00E0 kh\u00F4ng th\u1EF1c s\u1EF1 t\u00ECm ra ai l\u00E0 m\u1ED1i \u0111e d\u1ECDa an ninh qu\u1ED1c gia, ho\u1EA1t \u0111\u1ED9ng \u0111\u1EC3 b\u1EA3o v\u1EC7 an ninh l\u00E3nh th\u1ED5. \\n S\u1EF1 thay \u0111\u1ED5i ch\u00EDnh s\u00E1ch n\u00E0y khi\u1EBFn m\u1ECDi ng\u01B0\u1EDDi l\u1EA7m t\u01B0\u1EDFng r\u1EB1ng ph\u1EA3i ch\u00FA \u00FD \u0111\u1EBFn an ninh b\u00EAn ngo\u00E0i, tuy nhi\u00EAn \u0111\u00E2y kh\u00F4ng ph\u1EA3i l\u00E0 m\u1ED1i \u0111e d\u1ECDa th\u1EF1c s\u1EF1 m\u00E0 ch\u1EC9 l\u00E0 m\u1ED9t \u1EA3o \u1EA3nh, khi\u1EBFn m\u1ECDi ng\u01B0\u1EDDi l\u01A1 l\u00E0 an ninh trong n\u01B0\u1EDBc v\u00E0 l\u00E0m t\u00ECnh h\u00ECnh an ninh trong n\u01B0\u1EDBc tr\u1EDF n\u00EAn t\u1ED3i t\u1EC7 h\u01A1n. \\n C\u00E1ch \u0111\u00E2y kh\u00F4ng l\u00E2u, m\u1ED9t v\u1EE5 \u0111\u00E1nh bom kh\u1EE7ng b\u1ED1 nghi\u00EAm tr\u1ECDng \u0111\u00E3 x\u1EA3y ra t\u1EA1i \u0110\u1EA1i h\u1ECDc Qu\u1ED1c gia Mindanao \u1EDF th\u00E0nh ph\u1ED1 Marawi, khi\u1EBFn nhi\u1EC1u ng\u01B0\u1EDDi Philippines thi\u1EC7t m\u1EA1ng; \u1EDF m\u1ED9t t\u1EC9nh kh\u00E1c, m\u1ED9t v\u1EE5 n\u1ED5 c\u0169ng x\u1EA3y ra tr\u00EAn m\u1ED9t chi\u1EBFc xe bu\u00FDt. \\n \\n \\n \\n \\n V\u00EC v\u1EADy, ch\u00EDnh ph\u1EE7 n\u00E0y th\u1EF1c s\u1EF1 \u0111ang g\u1EB7p r\u1EAFc r\u1ED1i li\u00EAn t\u1EE5c. Khi Marcos v\u1EA5p ph\u1EA3i s\u1EF1 ph\u1EA3n \u0111\u1ED1i \u0111\u1ED1i v\u1EDBi c\u00E1c ch\u00EDnh s\u00E1ch sai l\u1EA7m c\u1EE7a m\u00ECnh, s\u1EF1 \u1EE7ng h\u1ED9 c\u1EE7a c\u00F4ng ch\u00FAng \u0111\u1ED1i v\u1EDBi \u00F4ng \u0111\u00E3 gi\u1EA3m \u00EDt nh\u1EA5t 15%. \\n Nh\u1EEFng t\u00ECnh h\u00ECnh trong n\u01B0\u1EDBc n\u00E0y cho th\u1EA5y n\u0103m 2024 s\u1EBD l\u00E0 m\u1ED9t n\u0103m r\u1EA5t kh\u00F3 kh\u0103n \u0111\u1ED1i v\u1EDBi Marcos. \\nM\u1EF9 \u0111\u00E3 g\u00E2y \u00E1p l\u1EF1c l\u1EDBn, y\u00EAu c\u1EA7u Philippines ti\u1EBFp t\u1EE5c ki\u1EC1m ch\u1EBF Trung Qu\u1ED1c. 
Nh\u01B0ng t\u00F4i ngh\u0129 \u0111\u1EA1i \u0111a s\u1ED1 ng\u01B0\u1EDDi Philippines c\u0169ng \u0111ang l\u00E0m ph\u1EA7n vi\u1EC7c c\u1EE7a m\u00ECnh \u0111\u1EC3 c\u1ED1 g\u1EAFng t\u00E1i kh\u1EB3ng \u0111\u1ECBnh quan \u0111i\u1EC3m v\u00E0 quan \u0111i\u1EC3m th\u1EF1c s\u1EF1 c\u1EE7a ng\u01B0\u1EDDi Philippines v\u1EC1 Trung Qu\u1ED1c, m\u1ED9t n\u01B0\u1EDBc th\u00E2n thi\u1EC7n, \u1EA5m \u00E1p v\u00E0 hi\u1EC7u qu\u1EA3. \\n 03 \\n Zhu Tan: \u00D4ng t\u1EEBng ch\u1EC9 ra r\u1EB1ng qu\u00E1 tr\u00ECnh chuy\u1EC3n \u0111\u1ED5i kinh t\u1EBF v\u00E0 t\u0103ng tr\u01B0\u1EDFng b\u1EC1n v\u1EEFng c\u1EE7a Philippines kh\u00F4ng th\u1EC3 t\u00E1ch r\u1EDDi vi\u1EC7c duy tr\u00EC h\u1EE3p t\u00E1c ch\u1EB7t ch\u1EBD v\u1EDBi Trung Qu\u1ED1c v\u00E0 ASEAN. T\u1EA1i sao \u00F4ng l\u1EA1i \u0111\u01B0a ra nh\u1EADn \u0111\u1ECBnh nh\u01B0 v\u1EADy? \\n Herman Laurel: Sau khi Marcos nh\u1EADm ch\u1EE9c, ch\u00EDnh s\u00E1ch \u0111\u1ED1i ngo\u1EA1i \u0111\u1ED9c l\u1EADp v\u00E0 quan h\u1EC7 h\u1EEFu ngh\u1ECB c\u1EE7a ch\u00EDnh ph\u1EE7 Philippines v\u1EDBi Trung Qu\u1ED1c v\u00E0 c\u00E1c n\u01B0\u1EDBc l\u00E1ng gi\u1EC1ng ch\u00E2u \u00C1 kh\u00E1c \u0111\u00E3 quay \u0111\u1EA7u v\u00E0 quay sang Hoa K\u1EF3, \u0111i\u1EC1u n\u00E0y mang l\u1EA1i nhi\u1EC1u v\u1EA5n \u0111\u1EC1 cho Philippines, ti\u1EC1m n\u0103ng nghi\u00EAm tr\u1ECDng nh\u1EA5t t\u00E1c \u0111\u1ED9ng v\u1EABn c\u00F3 th\u1EC3 xu\u1EA5t hi\u1EC7n trong l\u0129nh v\u1EF1c kinh t\u1EBF. \\nR\u00F5 r\u00E0ng \u0111\u1ED9ng c\u01A1 ph\u1EE5c h\u1ED3i kinh t\u1EBF to\u00E0n c\u1EA7u sau cu\u1ED9c \u0110\u1EA1i suy tho\u00E1i k\u00E9o d\u00E0i hai n\u0103m r\u01B0\u1EE1i l\u00E0 Trung Qu\u1ED1c. \\ n Li\u00EAn minh Ch\u00E2u \u00C2u hi\u1EC7n \u0111ang trong v\u1EF1c th\u1EB3m suy tho\u00E1i. T\u1ED5ng s\u1EA3n ph\u1EA9m qu\u1ED1c n\u1ED9i (GDP) th\u1EF1c t\u1EBF c\u1EE7a Nh\u1EADt B\u1EA3n l\u1EA7n \u0111\u1EA7u ti\u00EAn t\u0103ng tr\u01B0\u1EDFng \u00E2m trong qu\u00FD 3 n\u0103m nay. Tuy nhi\u00EAn, Qu\u1EF9 Ti\u1EC1n t\u1EC7 Qu\u1ED1c t\u1EBF g\u1EA7n \u0111\u00E2y \u0111\u00E3 n\u00E2ng d\u1EF1 b\u00E1o t\u0103ng tr\u01B0\u1EDFng GDP c\u1EE7a Trung Qu\u1ED1c cho n\u0103m 2023 l\u00EAn 5,4% v\u00E0 n\u0103m 2024. T\u01B0\u01A1ng lai v\u1EABn t\u01B0\u01A1i s\u00E1ng. \\n B\u1EA1n ph\u1EA3i bi\u1EBFt r\u1EB1ng Trung Qu\u1ED1c kh\u00F4ng ch\u1EC9 l\u00E0 \u0111\u1ED1i t\u00E1c th\u01B0\u01A1ng m\u1EA1i l\u1EDBn nh\u1EA5t c\u1EE7a ASEAN m\u00E0 c\u00F2n l\u00E0 \u0111\u1ED1i t\u00E1c th\u01B0\u01A1ng m\u1EA1i l\u1EDBn nh\u1EA5t c\u1EE7a Philippines. \\n Tuy nhi\u00EAn, ch\u00EDnh quy\u1EC1n Marcos \u0111\u00E3 h\u1EE7y b\u1ECF nhi\u1EC1u d\u1EF1 \u00E1n x\u00E2y d\u1EF1ng c\u1EE7a Trung Qu\u1ED1c, m\u1EB7c d\u00F9 Trung Qu\u1ED1c kh\u00F4ng c\u00F3 b\u1EA5t k\u1EF3 t\u00E1c \u0111\u1ED9ng ti\u00EAu c\u1EF1c n\u00E0o v\u1EC1 kinh t\u1EBF \u0111\u1ED1i v\u1EDBi Philippines. \\n T\u00F4i \u0111\u00E3 nh\u1EAFc nh\u1EDF \u0111\u1ED3ng b\u00E0o m\u00ECnh r\u1EB1ng ch\u00FAng ta kh\u00F4ng \u0111\u01B0\u1EE3c qu\u00EAn b\u00E0i h\u1ECDc c\u1EE7a n\u0103m 2014. 
Khi \u0111\u00F3, d\u1EF1 \u0111o\u00E1n sai l\u1EA7m c\u1EE7a ch\u00EDnh ph\u1EE7 Aquino \u0111\u00E3 khi\u1EBFn s\u1ED1 chu\u1ED1i tr\u1ECB gi\u00E1 h\u00E0ng t\u1EF7 peso m\u00E0 n\u01B0\u1EDBc ta d\u1EF1 \u0111\u1ECBnh xu\u1EA5t sang Trung Qu\u1ED1c ban \u0111\u1EA7u kh\u00F4ng v\u00E0o \u0111\u01B0\u1EE3c Trung Qu\u1ED1c, c\u00E1c nh\u00E0 xu\u1EA5t kh\u1EA9u chu\u1ED1i Philippines ch\u1EC9 c\u00F3 th\u1EC3 \u0111\u1EE9ng nh\u00ECn chu\u1ED1i c\u1EE7a m\u00ECnh th\u1ED1i r\u1EEFa trong nh\u1EEFng l\u00F4 b\u1ECB t\u1EEB ch\u1ED1i trong h\u1ED3 s\u01A1. \\n V\u00EC v\u1EADy, c\u00F3 th\u1EC3 th\u1EA5y r\u1EB1ng m\u1ED9t d\u1EF1 \u0111o\u00E1n sai l\u1EA7m c\u1EE7a ch\u00EDnh ph\u1EE7 Philippines c\u00F3 th\u1EC3 d\u1EABn \u0111\u1EBFn t\u00ECnh th\u1EBF kh\u00F3 kh\u0103n m\u00E0 ng\u01B0\u1EDDi d\u00E2n Philippines b\u00ECnh th\u01B0\u1EDDng ng\u00E0y nay ph\u1EA3i \u0111\u1ED1i m\u1EB7t. Ch\u00FAng t\u00F4i li\u00EAn t\u1EE5c nh\u1EAFc nh\u1EDF ng\u01B0\u1EDDi d\u00E2n trong n\u01B0\u1EDBc r\u1EB1ng ch\u00FAng t\u00F4i kh\u00F4ng mu\u1ED1n nh\u1EEFng t\u00ECnh hu\u1ED1ng n\u00E0y x\u1EA3y ra l\u1EA7n n\u1EEFa. \\nC\u00F3 l\u1EBD ch\u1EC9 c\u00F3 Trung Qu\u1ED1c m\u1EDBi th\u1EF1c s\u1EF1 c\u00F3 th\u1EC3 th\u00FAc \u0111\u1EA9y s\u1EF1 ph\u00E1t tri\u1EC3n kinh t\u1EBF m\u00E0 Philippines c\u1EA7n trong nh\u1EEFng n\u0103m t\u1EDBi. L\u1EA5y n\u0103m t\u1EDBi l\u00E0m v\u00ED d\u1EE5, Hoa K\u1EF3 c\u00F3 th\u1EC3 kh\u00F4ng c\u00F3 \u0111\u1EE7 ngu\u1ED3n l\u1EF1c t\u00E0i ch\u00EDnh \u0111\u1EC3 h\u1ED7 tr\u1EE3 Philippines, Nh\u1EADt B\u1EA3n c\u00F3 th\u1EC3 kh\u00F4ng th\u1EC3 cung c\u1EA5p h\u1ED7 tr\u1EE3 do suy tho\u00E1i kinh t\u1EBF, Ng\u00E2n h\u00E0ng Ph\u00E1t tri\u1EC3n Ch\u00E2u \u00C1 v\u1EABn c\u00F2n m\u1ED9t s\u1ED1 d\u1EF1 \u00E1n c\u0169 c\u1EA7n \u0111\u01B0\u1EE3c ho\u00E0n thi\u1EC7n. ti\u1EBFp t\u1EE5c v\u00E0 c\u00F3 th\u1EC3 kh\u00F4ng c\u00F3 th\u1EDDi gian \u0111\u1EC3 quan t\u00E2m \u0111\u1EBFn Philippines. \\n Trong nh\u1EEFng th\u00E1ng t\u1EDBi, n\u1EBFu ch\u00FAng ta c\u00F3 th\u1EC3 ng\u0103n ch\u1EB7n th\u00E0nh c\u00F4ng vi\u1EC7c Hoa K\u1EF3 ph\u00E1 ho\u1EA1i m\u1ED1i quan h\u1EC7 t\u1ED1t \u0111\u1EB9p c\u1EE7a Philippines v\u1EDBi Trung Qu\u1ED1c, ch\u00FAng ta c\u00F3 th\u1EC3 ti\u1EBFp t\u1EE5c h\u01B0\u1EDBng t\u1EDBi m\u1ED9t t\u01B0\u01A1ng lai t\u1ED1t \u0111\u1EB9p h\u01A1n. \\n V\u00EC v\u1EADy, ch\u00FAng t\u00F4i hy v\u1ECDng r\u1EB1ng v\u00E0o n\u0103m 2024, Philippines c\u00F3 th\u1EC3 xoay chuy\u1EC3n t\u00ECnh th\u1EBF. \\n \\n \\n [Bi\u00EAn t\u1EADp vi\u00EAn: T\u1EC1 L\u00F4i]"
|
205 |
+
# message = "G\u1EA7n \u0111\u00E2y, Philippines \u0111\u00E3 nhi\u1EC1u l\u1EA7n g\u00E2y ra r\u1EAFc r\u1ED1i \u1EDF Bi\u1EC3n \u0110\u00F4ng, x\u00E2m nh\u1EADp \u0110\u00E1 Nh\u00E2n \u00C1i thu\u1ED9c qu\u1EA7n \u0111\u1EA3o Nam Sa c\u1EE7a Trung Qu\u1ED1c, \u0111\u1ED3ng th\u1EDDi ti\u1EBFp t\u1EE5c ph\u00F3ng \u0111\u1EA1i v\u00E0 c\u01B0\u1EDDng \u0111i\u1EC7u h\u00F3a, nh\u1EA7m l\u1EABn \u0111\u00FAng sai v\u00E0 \u0111\u01B0a ra nh\u1EEFng c\u00E1o bu\u1ED9c v\u00F4 l\u00FD ch\u1ED1ng l\u1EA1i Trung Qu\u1ED1c. \\n Ng\u01B0\u1EDDi Philippines ngh\u0129 g\u00EC v\u1EC1 h\u00E0nh \u0111\u1ED9ng c\u1EE7a ch\u00EDnh ph\u1EE7 Philippines? \\ n \u00D4ng Tan \u0111\u00E3 t\u00ECm ra Herman Laurel, m\u1ED9t nh\u00E0 b\u00ECnh lu\u1EADn ch\u00EDnh tr\u1ECB n\u1ED5i ti\u1EBFng \u1EDF Philippines, \u0111\u1ED3ng th\u1EDDi l\u00E0 gi\u00E1m \u0111\u1ED1c Vi\u1EC7n nghi\u00EAn c\u1EE9u chi\u1EBFn l\u01B0\u1EE3c \"Th\u1EBF k\u1EF7 ch\u00E2u \u00C1\" c\u1EE7a Philippines v\u00E0 l\u00E0 ng\u01B0\u1EDDi s\u00E1ng l\u1EADp Hi\u1EC7p h\u1ED9i nghi\u00EAn c\u1EE9u ch\u00EDnh s\u00E1ch BRICS c\u1EE7a Philippines. Herman Laurel, ng\u01B0\u1EDDi \u0111ang nghi\u00EAn c\u1EE9u v\u1EC1 ch\u00E2u \u00C1, \u0111\u1EB7c bi\u1EC7t l\u00E0 m\u1ED1i quan h\u1EC7 gi\u1EEFa Trung Qu\u1ED1c v\u00E0 ASEAN, chia s\u1EBB quan s\u00E1t c\u1EE7a m\u00ECnh. \\n 01 \\n Tan Zhu: K\u1EC3 t\u1EEB n\u1EEDa cu\u1ED1i n\u0103m nay, Philippines th\u01B0\u1EDDng xuy\u00EAn x\u00E2m chi\u1EBFm c\u00E1c \u0111\u1EA3o v\u00E0 b\u00E3i \u0111\u00E1 c\u1EE7a ch\u00FAng t\u00F4i \u1EDF Bi\u1EC3n \u0110\u00F4ng, l\u00E0m gia t\u0103ng c\u0103ng th\u1EB3ng \u1EDF Bi\u1EC3n \u0110\u00F4ng, \u0111\u1ED3ng th\u1EDDi vu kh\u1ED1ng Trung Qu\u1ED1c l\u00E0 k\u1EBB b\u1EAFt n\u1EA1t l\u1EDBn. , c\u00F3 nh\u1EEFng nh\u00F3m \u1EDF Philippines ph\u1EA3n \u0111\u1ED1i h\u00E0nh \u0111\u1ED9ng khi\u00EAu kh\u00EDch c\u1EE7a Philippines. B\u1EA1n c\u1EA3m th\u1EA5y th\u1EBF n\u00E0o v\u1EC1 \u0111i\u1EC1u n\u00E0y v\u1EDBi t\u01B0 c\u00E1ch l\u00E0 m\u1ED9t ng\u01B0\u1EDDi Philippines? Herman Laurel: T\u00F4i cho r\u1EB1ng C\u1EA3nh s\u00E1t bi\u1EC3n Philippines v\u00E0 m\u1ED9t s\u1ED1 t\u00E0u d\u00E2n s\u1EF1 Philippines \u0111\u00E3 c\u1ED1 t\u00ECnh g\u00E2y r\u1EAFc r\u1ED1i v\u00EC h\u1ECD \u0111ang th\u1EF1c hi\u1EC7n \u201CD\u1EF1 \u00E1n t\u1ED5ng th\u1EC3\u201D c\u1EE7a Hoa K\u1EF3. \\nM\u1EF9 khoe khoang d\u1EF1 \u00E1n n\u00E0y v\u1EDBi ch\u00FAng t\u00F4i n\u00EAn ch\u00FAng t\u00F4i bi\u1EBFt t\u00EAn d\u1EF1 \u00E1n. D\u1EF1 \u00E1n \u0111\u01B0\u1EE3c d\u1EABn d\u1EAFt b\u1EDFi \u0110\u1EA1i t\u00E1 Kh\u00F4ng qu\u00E2n Hoa K\u1EF3 \u0111\u00E3 ngh\u1EC9 h\u01B0u Raimundo Powell, l\u00E0m vi\u1EC7c v\u1EDBi c\u00E1c \u0111\u1ED1i t\u00E1c Philippines nh\u01B0 C\u1EA3nh s\u00E1t bi\u1EC3n Philippines v\u00E0 m\u1ED9t s\u1ED1 quan ch\u1EE9c c\u1EE7a B\u1ED9 Ngo\u1EA1i giao Philippines"
|
206 |
+
# qs = "Tóm tắt nội dung liên quan đến Biển Đông và Việt Nam"
|
207 |
+
# lg = "vi"
|
208 |
+
# ver = "en-llama"
|
209 |
+
# ans = summary_with_llama(qs, message, lg, version=ver, max_word_per_context=1024)
|
210 |
+
# print(ans)
|
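created_context in the file above chunks an article sentence by sentence before each chunk is sent to the LLM endpoint; a small sketch, assuming data/remove.txt exists since the module reads it at import time:

    from function.summary_with_llm import created_context

    article = " ".join(["Đây là một câu ví dụ."] * 300)
    chunks = created_context(article, max_word=200)
    # A chunk closes once its running word count passes max_word, so the article
    # is split into a handful of roughly equal pieces plus a shorter tail.
    print(len(chunks), [c.count(" ") for c in chunks])
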
function/tc_v2.py
ADDED
@@ -0,0 +1,573 @@
from tensorRT import inference
import re
from collections import Counter
from vncorenlp import VnCoreNLP
from nltk.tokenize import sent_tokenize
import torch
import datetime
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import json
from . import utils
import time
from summary import text_summary, get_summary_bert
from function.clean_text import normalize_text
from .summary_with_llm import summary_with_llama
from .translate import translate_text_multi_layer
from scipy.spatial import distance
import copy
from .sentence_embbeding import embbeded_zh, embbeded_en, embedded_bge


# from . import detect_time as dt

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
use_cuda = torch.cuda.is_available()
print(torch.cuda.is_available())

# annotator = VnCoreNLP('vncorenlp/VnCoreNLP-1.1.1.jar', port=9191, annotators="wseg,pos", max_heap_size='-Xmx2g')


def detect_postaging(text_in):
    word_segmented_text = annotator.annotate(text_in)
    lst_k = []
    for se in word_segmented_text["sentences"]:
        for kw in se:
            if kw["posTag"] in ("Np", "Ny", "N"):
                if kw["posTag"] == "N" and "_" not in kw["form"]:
                    continue
                lst_k.append(kw["form"].replace("_", " "))
    return list(set(lst_k))


def clean_text(text_in):
    doc = re.sub('<.*?>', '', text_in)
    doc = re.sub('(function).*}', ' ', doc)
    # link
    doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
    doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
    doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
|
50 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
|
51 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
|
52 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
|
53 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
|
54 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
|
55 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
|
56 |
+
|
57 |
+
doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
|
58 |
+
doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
|
59 |
+
doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
|
60 |
+
doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
|
61 |
+
doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
|
62 |
+
doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
|
63 |
+
doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
|
64 |
+
doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
|
65 |
+
doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
|
66 |
+
# escape sequence
|
67 |
+
doc = re.sub('\n', ' ', doc)
|
68 |
+
doc = re.sub('\t', ' ', doc)
|
69 |
+
doc = re.sub('\r', ' ', doc)
|
70 |
+
|
71 |
+
doc = normalize_text(doc)
|
72 |
+
return doc
|
73 |
+
|
74 |
+
|
75 |
+
def data_cleaning(docs):
|
76 |
+
res = []
|
77 |
+
for d in docs:
|
78 |
+
if 'message' in d:
|
79 |
+
# css and js
|
80 |
+
doc = re.sub('<.*?>', '', d['message'])
|
81 |
+
doc = re.sub('(function).*}', ' ', doc)
|
82 |
+
|
83 |
+
# link
|
84 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
|
85 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
|
86 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
|
87 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
|
88 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
|
89 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
|
90 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
|
91 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
|
92 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
|
93 |
+
|
94 |
+
doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
|
95 |
+
doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
|
96 |
+
doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
|
97 |
+
doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
|
98 |
+
doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
|
99 |
+
doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
|
100 |
+
doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
|
101 |
+
doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
|
102 |
+
doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
|
103 |
+
# escape sequence
|
104 |
+
doc = re.sub('\n', ' ', doc)
|
105 |
+
doc = re.sub('\t', ' ', doc)
|
106 |
+
doc = re.sub('\r', ' ', doc)
|
107 |
+
|
108 |
+
d['message'] = doc
|
109 |
+
res.append(d)
|
110 |
+
return res
|
111 |
+
|
112 |
+
|
113 |
+
def segment(docs, lang="vi"):
|
114 |
+
segmented_docs = []
|
115 |
+
for d in docs:
|
116 |
+
# print(d)
|
117 |
+
# if len(d.get('message', "")) > 8000 or len(d.get('message', "")) < 100:
|
118 |
+
if len(d.get('message', "")) > 8000:
|
119 |
+
continue
|
120 |
+
if 'snippet' not in d:
|
121 |
+
continue
|
122 |
+
try:
|
123 |
+
if lang == "vi":
|
124 |
+
snippet = d.get('snippet', "")
|
125 |
+
segmented_snippet = ""
|
126 |
+
segmented_sentences_snippet = annotator.tokenize(snippet)
|
127 |
+
for sentence in segmented_sentences_snippet:
|
128 |
+
segmented_snippet += ' ' + ' '.join(sentence)
|
129 |
+
segmented_snippet = segmented_snippet.replace('\xa0', '')
|
130 |
+
d['segmented_snippet'] = segmented_snippet
|
131 |
+
segmented_docs.append(d)
|
132 |
+
except Exception:
|
133 |
+
pass
|
134 |
+
return segmented_docs
|
135 |
+
|
136 |
+
|
137 |
+
def timestamp_to_date(timestamp):
|
138 |
+
return datetime.datetime.fromtimestamp(timestamp).strftime('%d/%m/%Y')
|
139 |
+
|
140 |
+
|
141 |
+
def re_ranking(result_topic, vectors_prompt, sorted_field):
|
142 |
+
lst_score = []
|
143 |
+
lst_ids = []
|
144 |
+
lst_top = []
|
145 |
+
try:
|
146 |
+
for k in result_topic:
|
147 |
+
lst_ids.append(k)
|
148 |
+
if not sorted_field.strip():
|
149 |
+
lst_top.append(len(result_topic[k]))
|
150 |
+
else:
|
151 |
+
lst_top.append(result_topic[k][0]['max_score'])
|
152 |
+
vector_center = result_topic[k][0]["vector"]
|
153 |
+
max_score = 11.0
|
154 |
+
for vec in vectors_prompt:
|
155 |
+
score = distance.cosine(np.array(vec), np.array(vector_center))
|
156 |
+
if score < max_score:
|
157 |
+
max_score = score
|
158 |
+
lst_score.append(max_score)
|
159 |
+
result_topic[k][0]["similarity_score"] = max_score
|
160 |
+
for d in result_topic[k]:
|
161 |
+
d["similarity_score"] = max_score
|
162 |
+
del result_topic[k][0]["vector"]
|
163 |
+
idx = np.argsort(np.array(lst_score))
|
164 |
+
except Exception as ve:
|
165 |
+
return [], lst_ids, lst_top
|
166 |
+
return idx, lst_ids, lst_top
|
167 |
+
|
168 |
+
def post_processing(response, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster = 50, delete_message=True, prompt="", hash_str: str= "", vectors_prompt: list = []):
|
169 |
+
print(f'[INFO] sorted_field: {sorted_field}')
|
170 |
+
MAX_DOC_PER_CLUSTER = max_doc_per_cluster
|
171 |
+
|
172 |
+
lst_ids = []
|
173 |
+
lst_top = []
|
174 |
+
lst_res = []
|
175 |
+
idx = []
|
176 |
+
if prompt:
|
177 |
+
idx, lst_ids, lst_top = re_ranking(response, vectors_prompt, sorted_field)
|
178 |
+
print("idx_prompt: ", idx)
|
179 |
+
if len(prompt) == 0 or len(idx) == 0:
|
180 |
+
for i in response:
|
181 |
+
lst_ids.append(i)
|
182 |
+
if not sorted_field.strip():
|
183 |
+
lst_top.append(len(response[i]))
|
184 |
+
else:
|
185 |
+
lst_top.append(response[i][0]['max_score'])
|
186 |
+
idx = np.argsort(np.array(lst_top))[::-1]
|
187 |
+
print("idx_not_prompt: ", idx)
|
188 |
+
if top_cluster == -1:
|
189 |
+
top_cluster = len(idx)
|
190 |
+
for i in idx[: top_cluster]:
|
191 |
+
ik = lst_ids[i]
|
192 |
+
if top_sentence == -1:
|
193 |
+
top_sentence = len(response[ik])
|
194 |
+
lst_check_title = []
|
195 |
+
lst_check_not_title = []
|
196 |
+
i_c_t = 0
|
197 |
+
for resss in response[ik]:
|
198 |
+
r_title = resss.get("title", "")
|
199 |
+
if r_title and not r_title.endswith("..."):
|
200 |
+
lst_check_title.append(resss)
|
201 |
+
i_c_t += 1
|
202 |
+
else:
|
203 |
+
lst_check_not_title.append(resss)
|
204 |
+
if i_c_t == top_sentence:
|
205 |
+
break
|
206 |
+
if i_c_t == top_sentence:
|
207 |
+
lst_res.append(lst_check_title)
|
208 |
+
else:
|
209 |
+
lst_check_title.extend(lst_check_not_title)
|
210 |
+
lst_res.append(lst_check_title[:top_sentence])
|
211 |
+
#lst_res.append(response[ik][:top_sentence])
|
212 |
+
dict_res = {}
|
213 |
+
for i in range(len(lst_res)):
|
214 |
+
dict_res[str(i + 1)] = lst_res[i][:MAX_DOC_PER_CLUSTER]
|
215 |
+
for j in range(min(len(dict_res[str(i + 1)]), 3)):
|
216 |
+
dict_res[str(i + 1)][0]["title_summarize"].append(dict_res[str(i + 1)][j].get("snippet", ""))
|
217 |
+
# t11 = time.time()
|
218 |
+
summary_text = get_summary_bert(dict_res[str(i + 1)][0].get("message", ""), dict_res[str(i + 1)][0].get("lang", "vi"), topn=topn_summary, title=dict_res[str(i + 1)][0].get("title", ""), snippet=dict_res[str(i + 1)][0].get("snippet", ""))
|
219 |
+
# print("time_summary: ", time.time() - t11)
|
220 |
+
if len(summary_text) < 10:
|
221 |
+
summary_text = dict_res[str(i + 1)][0].get("snippet", "")
|
222 |
+
if len(summary_text) < 10:
|
223 |
+
summary_text = dict_res[str(i + 1)][0].get("title", "")
|
224 |
+
summary_text = utils.remove_image_keyword(summary_text)
|
225 |
+
# if prompt:
|
226 |
+
# if dict_res[str(i + 1)][0].get("message", ""):
|
227 |
+
# src_lang = dict_res[str(i + 1)][0].get("lang", "")
|
228 |
+
# print("src_lang: ", src_lang)
|
229 |
+
# print("summary_text: ", summary_text)
|
230 |
+
# summary_text = translate_text_multi_layer(src_lang, "vi", summary_text)
|
231 |
+
# text_tran = translate_text_multi_layer(src_lang, "vi", dict_res[str(i + 1)][0].get("message", ""))
|
232 |
+
# ans_from_llama = summary_with_llama(prompt, text_tran, "vi", version="vi-llama", max_word_per_context=1000)
|
233 |
+
# print("ans_from_llama: ", ans_from_llama)
|
234 |
+
# summary_text = summary_text + "$$$$\n" + ans_from_llama
|
235 |
+
# print("summary_text: ", summary_text, len(summary_text))
|
236 |
+
dict_res[str(i + 1)][0]["content_summary"] = summary_text
|
237 |
+
dict_res[str(i + 1)][0]["num_of_post"] = len(lst_res[i])
|
238 |
+
kew_phares = []
|
239 |
+
dict_res[str(i + 1)][0]["topic_keywords"] = kew_phares
|
240 |
+
|
241 |
+
# print("delete_message: ", delete_message)
|
242 |
+
if delete_message:
|
243 |
+
for j in range(len(dict_res[str(i + 1)])):
|
244 |
+
if "message" in dict_res[str(i + 1)][j]:
|
245 |
+
del dict_res[str(i + 1)][j]["message"]
|
246 |
+
|
247 |
+
with open(f"log_llm/topic_result_after_postprocessing/{hash_str}.json", "w") as f:
|
248 |
+
dict_log_pos = {}
|
249 |
+
for k in dict_res:
|
250 |
+
dict_log_pos[k] = copy.deepcopy(dict_res[k])
|
251 |
+
for d in dict_log_pos[k]:
|
252 |
+
if "message" in d:
|
253 |
+
del d["message"]
|
254 |
+
if "vector" in d:
|
255 |
+
del d["vector"]
|
256 |
+
json.dump(dict_log_pos, f, ensure_ascii= False)
|
257 |
+
return dict_res
|
258 |
+
|
259 |
+
|
260 |
+
def get_lang(docs):
|
261 |
+
lang_vi = 0
|
262 |
+
lang_en = 0
|
263 |
+
dict_lang = {}
|
264 |
+
for d in docs:
|
265 |
+
lang = d.get("lang", "")
|
266 |
+
if lang not in dict_lang:
|
267 |
+
dict_lang[lang] = 0
|
268 |
+
dict_lang[lang] += 1
|
269 |
+
# if d.get("lang", "") == "vi":
|
270 |
+
# lang_vi += 1
|
271 |
+
# else:
|
272 |
+
# lang_en += 1
|
273 |
+
lst_lang = []
|
274 |
+
lst_cnt = []
|
275 |
+
for k in dict_lang:
|
276 |
+
lst_lang.append(k)
|
277 |
+
lst_cnt.append(dict_lang[k])
|
278 |
+
idx_max = np.argsort(np.array(lst_cnt))[::-1][0]
|
279 |
+
lang = lst_lang[int(idx_max)]
|
280 |
+
|
281 |
+
if lang.startswith("zh_"):
|
282 |
+
lang = "zh"
|
283 |
+
print("lang: ", lang, lst_cnt[int(idx_max)])
|
284 |
+
return lang
|
285 |
+
|
286 |
+
|
287 |
+
def topic_clustering(docs, distance_threshold, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster=50,
|
288 |
+
delete_message=True, prompt="", type_cluster:str = "single", hash_str: str= "", id_topic=""):
|
289 |
+
# global model, model_en
|
290 |
+
with open("data/topic_name.txt") as f:
|
291 |
+
dict_topic_name = json.load(f)
|
292 |
+
topic_name_relevant = dict_topic_name.get(id_topic , "")
|
293 |
+
docs = docs[:30000]
|
294 |
+
lang = get_lang(docs)
|
295 |
+
if type_cluster == "complete" and lang == "zh":
|
296 |
+
distance_threshold = 0.4
|
297 |
+
if type_cluster == "complete" and lang == "en":
|
298 |
+
distance_threshold = 0.4
|
299 |
+
# type_cluster = "single"
|
300 |
+
|
301 |
+
result = {}
|
302 |
+
cluster_score = {}
|
303 |
+
cluster_real_vectors = {}
|
304 |
+
# docs = segment(docs, lang=lang)
|
305 |
+
|
306 |
+
t1 = time.time()
|
307 |
+
if len(docs) < 1:
|
308 |
+
return result
|
309 |
+
elif len(docs) == 1:
|
310 |
+
return {
|
311 |
+
"0": docs
|
312 |
+
}
|
313 |
+
vec_prompt = []
|
314 |
+
prompt_strips = []
|
315 |
+
# prompt = ""
|
316 |
+
if topic_name_relevant:
|
317 |
+
prompt_split = topic_name_relevant.split("#####")
|
318 |
+
for prom in prompt_split:
|
319 |
+
sys_p = prom.strip().split("$$$$")
|
320 |
+
if len(sys_p) == 1:
|
321 |
+
prompt_strips.append(prom.strip())
|
322 |
+
else:
|
323 |
+
prompt_strips.append(sys_p[1].strip())
|
324 |
+
if lang == "zh":
|
325 |
+
vec_prompt = embbeded_zh(prompt_split)
|
326 |
+
elif lang == "en":
|
327 |
+
vec_prompt = embbeded_en(prompt_split)
|
328 |
+
else:
|
329 |
+
vec_prompt = inference.encode(prompt_split, lang=lang)
|
330 |
+
if lang == "zh":
|
331 |
+
features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
|
332 |
+
vectors = embbeded_zh(features)
|
333 |
+
# vectors = embedded_bge(features)
|
334 |
+
if len(vectors) == 0:
|
335 |
+
print(f"[WARNING] Embedded {lang}: {len(vectors)} / {len(features)}")
|
336 |
+
vectors = inference.encode(features, lang=lang)
|
337 |
+
# vectors = model.encode(features, show_progress_bar=False)
|
338 |
+
elif lang == "en":
|
339 |
+
features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
|
340 |
+
vectors = embbeded_en(features)
|
341 |
+
# vectors = embedded_bge(features)
|
342 |
+
if len(vectors) == 0:
|
343 |
+
print(f"[WARNING] Embedded {lang}: {len(vectors)} / {len(features)}")
|
344 |
+
vectors = inference.encode(features, lang=lang)
|
345 |
+
else:
|
346 |
+
features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
|
347 |
+
# vectors = embedded_bge(features)
|
348 |
+
# if len(vectors) == 0:
|
349 |
+
# print(f"[WARNING] Embedded {lang}: {len(vectors)} / {len(features)}")
|
350 |
+
vectors = inference.encode(features, lang=lang)
|
351 |
+
# vectors = model_en.encode(features, show_progress_bar=False)
|
352 |
+
clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
|
353 |
+
linkage=type_cluster, distance_threshold=distance_threshold)
|
354 |
+
clusteror.fit(vectors)
|
355 |
+
matrix_vec = np.stack(vectors, axis=0)
|
356 |
+
print(f"Time encode + clustering: {time.time() - t1} {clusteror.n_clusters_}")
|
357 |
+
for i in range(clusteror.n_clusters_):
|
358 |
+
result[str(i + 1)] = []
|
359 |
+
cluster_score[str(i + 1)] = 0
|
360 |
+
ids = clusteror.labels_ # == i
|
361 |
+
# cluster_real_vectors[str(i + 1)] = re_clustering(ids, matrix_vec, distance_threshold, max_doc_per_cluster)
|
362 |
+
|
363 |
+
for i in range(len(clusteror.labels_)):
|
364 |
+
cluster_no = clusteror.labels_[i]
|
365 |
+
# if any((cluster_real_vectors[str(cluster_no+1)][:] == vectors[i]).all(1)):
|
366 |
+
if docs[i].get('domain','') not in ["cungcau.vn","baomoi.com","news.skydoor.net"]:
|
367 |
+
response_doc = {}
|
368 |
+
response_doc = docs[i]
|
369 |
+
score = response_doc.get('score', 0)
|
370 |
+
if not docs[i].get('message','').strip():
|
371 |
+
continue
|
372 |
+
if score > cluster_score[str(cluster_no + 1)]:
|
373 |
+
cluster_score[str(cluster_no + 1)] = score
|
374 |
+
if 'domain' in docs[i]:
|
375 |
+
response_doc['domain'] = docs[i]['domain']
|
376 |
+
if 'url' in docs[i]:
|
377 |
+
response_doc['url'] = docs[i]['url']
|
378 |
+
if 'title' in docs[i]:
|
379 |
+
response_doc['title'] = clean_text(docs[i]['title'])
|
380 |
+
if 'snippet' in docs[i]:
|
381 |
+
response_doc['snippet'] = clean_text(docs[i]['snippet'])
|
382 |
+
if 'created_time' in docs[i]:
|
383 |
+
response_doc['created_time'] = docs[i]['created_time']
|
384 |
+
if "sentiment" in docs[i]:
|
385 |
+
response_doc['sentiment'] = docs[i]['sentiment']
|
386 |
+
if 'message' in docs[i]:
|
387 |
+
title = docs[i].get('title','')
|
388 |
+
snippet = docs[i].get('snippet','')
|
389 |
+
message = docs[i].get('message','')
|
390 |
+
if title.strip():
|
391 |
+
split_mess = message.split(title)
|
392 |
+
if len(split_mess) > 1:
|
393 |
+
message = title.join(split_mess[1:])
|
394 |
+
if snippet.strip():
|
395 |
+
split_mess = message.split(snippet)
|
396 |
+
if len(split_mess) > 1:
|
397 |
+
message = snippet.join(split_mess[1:])
|
398 |
+
|
399 |
+
response_doc['message'] = clean_text(message)
|
400 |
+
if 'id' in docs[i]:
|
401 |
+
response_doc['id'] = docs[i]['id']
|
402 |
+
# response_doc['score'] = 0.0
|
403 |
+
response_doc['title_summarize'] = []
|
404 |
+
response_doc['content_summary'] = ""
|
405 |
+
response_doc['total_facebook_viral'] = 0
|
406 |
+
response_doc["vector"] = np.array(vectors[i]).tolist()
|
407 |
+
result[str(cluster_no + 1)].append(response_doc)
|
408 |
+
empty_clus_ids = []
|
409 |
+
for x in result:
|
410 |
+
result[x] = sorted(result[x], key=lambda i: -len(i.get('message','')))
|
411 |
+
if len( result[x]) > 0:
|
412 |
+
if len(result[x]) > 1:
|
413 |
+
result[x] = check_duplicate_title_domain(result[x])
|
414 |
+
result[x][0]['num_docs'] = len(result[x])
|
415 |
+
result[x][0]['max_score'] = cluster_score[x]
|
416 |
+
else:
|
417 |
+
empty_clus_ids.append(x)
|
418 |
+
|
419 |
+
for x in empty_clus_ids:
|
420 |
+
result.pop(x,None)
|
421 |
+
# result = dict(sorted(result.items(), key=lambda i: -len(i[1])))[:top_cluster]
|
422 |
+
with open(f"log_llm/topic_result_before_postprocessing/{hash_str}.json", "w") as f:
|
423 |
+
dict_log = {}
|
424 |
+
for k in result:
|
425 |
+
dict_log[k] = copy.deepcopy(result[k])
|
426 |
+
for d in dict_log[k]:
|
427 |
+
if "message" in d:
|
428 |
+
del d["message"]
|
429 |
+
if "vector" in d:
|
430 |
+
del d["vector"]
|
431 |
+
json.dump(dict_log, f, ensure_ascii= False)
|
432 |
+
return post_processing(result, top_cluster=top_cluster, top_sentence=top_sentence, topn_summary=topn_summary, sorted_field = sorted_field, max_doc_per_cluster=max_doc_per_cluster, delete_message=delete_message,
|
433 |
+
prompt=topic_name_relevant, hash_str=hash_str, vectors_prompt=vec_prompt)
|
434 |
+
|
435 |
+
def check_duplicate_title_domain(docs):
|
436 |
+
lst_title_domain = [f"{d.get('domain', '')} {d.get('title','')}" for d in docs]
|
437 |
+
for i in range(1,len(lst_title_domain) -1):
|
438 |
+
for j in range(i+1,len(lst_title_domain)):
|
439 |
+
if lst_title_domain[j] == lst_title_domain[i]:
|
440 |
+
lst_title_domain[j] = 'dup'
|
441 |
+
lst_filter_docs = [docs[i] for i,x in enumerate(lst_title_domain) if x != 'dup']
|
442 |
+
return lst_filter_docs
|
443 |
+
def convert_date(text):
|
444 |
+
text = text.replace(".", "/")
|
445 |
+
text = text.replace("-", "/")
|
446 |
+
return text
|
447 |
+
|
448 |
+
|
449 |
+
def check_keyword(sentence):
|
450 |
+
keyword = ['sáng', 'trưa', 'chiều', 'tối', 'đến', 'hôm', 'ngày', 'tới']
|
451 |
+
for k in keyword:
|
452 |
+
if k in sentence:
|
453 |
+
return True
|
454 |
+
return False
|
455 |
+
|
456 |
+
|
457 |
+
def extract_events_and_time(docs, publish_date):
|
458 |
+
def standardize(date_str):
|
459 |
+
return date_str.replace('.', '/').replace('-', '/')
|
460 |
+
|
461 |
+
def add_0(date_str):
|
462 |
+
|
463 |
+
date_str = date_str.split('/')
|
464 |
+
res = []
|
465 |
+
for o in date_str:
|
466 |
+
o = re.sub('\s+', '', o)
|
467 |
+
if len(o) < 2:
|
468 |
+
o = '0' + o
|
469 |
+
res.append(o)
|
470 |
+
date_str = '/'.join(res)
|
471 |
+
return date_str
|
472 |
+
|
473 |
+
def get_date_list(reg, sentence):
|
474 |
+
find_object = re.finditer(reg, sentence)
|
475 |
+
date_list = [x.group() for x in find_object]
|
476 |
+
return date_list
|
477 |
+
|
478 |
+
year = publish_date.split('/')[2]
|
479 |
+
|
480 |
+
# dd/mm/yyyy
|
481 |
+
reg_exp_1 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
|
482 |
+
# #mm/yyyy
|
483 |
+
# reg_exp_5 = '(\D|^)(?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
|
484 |
+
# dd/mm
|
485 |
+
reg_exp_2 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])(\D|$)'
|
486 |
+
|
487 |
+
# ngày dd tháng mm năm yyyy
|
488 |
+
reg_exp_3 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}\s*(năm)\s*\d{4}'
|
489 |
+
# ngày dd tháng mm
|
490 |
+
reg_exp_4 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}'
|
491 |
+
|
492 |
+
result = []
|
493 |
+
for d in docs:
|
494 |
+
text = d['message']
|
495 |
+
for sentence in sent_tokenize(text):
|
496 |
+
lower_sentence = sentence.lower()
|
497 |
+
c = re.search(reg_exp_3, sentence.lower())
|
498 |
+
d = re.search(reg_exp_4, sentence.lower())
|
499 |
+
# e = re.search(reg_exp_5, sentence.lower())
|
500 |
+
a = re.search(reg_exp_1, sentence)
|
501 |
+
b = re.search(reg_exp_2, sentence)
|
502 |
+
#
|
503 |
+
if (a or b or c or d) and check_keyword(lower_sentence):
|
504 |
+
date_list = get_date_list(reg_exp_1, lower_sentence)
|
505 |
+
date_entity = ''
|
506 |
+
if date_list:
|
507 |
+
date_entity = add_0(standardize(date_list[0]))
|
508 |
+
elif get_date_list(reg_exp_2, lower_sentence):
|
509 |
+
date_list = get_date_list(reg_exp_2, lower_sentence)
|
510 |
+
date_entity = add_0(standardize(date_list[0]) + '/' + year)
|
511 |
+
elif get_date_list(reg_exp_3, lower_sentence):
|
512 |
+
date_list = get_date_list(reg_exp_3, lower_sentence)
|
513 |
+
|
514 |
+
date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
|
515 |
+
date_entity = re.sub('\s+', ' ', date_entity)
|
516 |
+
date_entity = date_entity.replace(' ', '/')
|
517 |
+
date_entity = add_0(date_entity)
|
518 |
+
else:
|
519 |
+
date_list = get_date_list(reg_exp_4, lower_sentence)
|
520 |
+
if date_list != []:
|
521 |
+
date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
|
522 |
+
date_entity = re.sub('\s+', ' ', date_entity)
|
523 |
+
date_entity = date_entity.replace(' ', '/')
|
524 |
+
date_entity = date_entity + '/' + year
|
525 |
+
date_entity = add_0(date_entity)
|
526 |
+
result.append((sentence, date_entity))
|
527 |
+
return result
|
528 |
+
|
529 |
+
def find_index_nearest_vector(cluster, vectors):
|
530 |
+
# Compute the centroid of the cluster
|
531 |
+
centroid = np.mean(cluster, axis=0, keepdims=True)
|
532 |
+
|
533 |
+
# Calculate the Euclidean distance between each vector and the centroid
|
534 |
+
distances = cosine_similarity(centroid, vectors)
|
535 |
+
|
536 |
+
# Find the index of the vector with the minimum distance
|
537 |
+
nearest_index = np.argmin(distances, axis=1)
|
538 |
+
|
539 |
+
|
540 |
+
return nearest_index
|
541 |
+
|
542 |
+
def re_clustering(ids, vectors, distance_threshold, max_doc_per_cluster):
|
543 |
+
sub_vectors = vectors[ids]
|
544 |
+
|
545 |
+
try:
|
546 |
+
if sub_vectors.shape[0] < 2:
|
547 |
+
return sub_vectors
|
548 |
+
sub_clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
|
549 |
+
linkage='complete', distance_threshold=0.12)
|
550 |
+
sub_clusteror.fit(sub_vectors)
|
551 |
+
dict_cluster = {id_clus: sub_vectors[sub_clusteror.labels_ == id_clus] for id_clus in range(sub_clusteror.n_clusters_)}
|
552 |
+
dict_num_vec = {id_clus: v.shape[0] for id_clus, v in dict_cluster.items()}
|
553 |
+
|
554 |
+
max_num_cluster = max(dict_num_vec, key=dict_num_vec.get)
|
555 |
+
other_vectors = sub_vectors[sub_clusteror.labels_ != max_num_cluster]
|
556 |
+
|
557 |
+
# if other_vectors.shape[0]:
|
558 |
+
# while dict_num_vec[max_num_cluster] < max_doc_per_cluster:
|
559 |
+
# tmp_index_vec = find_index_nearest_vector(dict_cluster[max_num_cluster], other_vectors)
|
560 |
+
# dict_cluster[max_num_cluster] = np.vstack((dict_cluster[max_num_cluster], other_vectors[tmp_index_vec]))
|
561 |
+
# dict_num_vec[max_num_cluster] += 1
|
562 |
+
# if other_vectors.shape[0] != 1:
|
563 |
+
# other_vectors = np.delete(other_vectors, tmp_index_vec, axis=0)
|
564 |
+
# else:
|
565 |
+
# break
|
566 |
+
cosine_scores = cosine_similarity(dict_cluster[max_num_cluster], dict_cluster[max_num_cluster])
|
567 |
+
with open("/home/vietle/topic-clustering/log_score.txt", "a") as f:
|
568 |
+
f.write(str(cosine_scores) + "\n")
|
569 |
+
return dict_cluster[max_num_cluster]
|
570 |
+
except Exception as e:
|
571 |
+
with open("/home/vietle/topic-clustering/log_clustering_diemtin/log_cluster_second.txt", "a") as f:
|
572 |
+
f.write(str(e)+"$$"+json.dumps({"ids": ids.tolist(), "vectors": vectors.tolist()}))
|
573 |
+
return sub_vectors
|
function/topic_clustering.py
ADDED
@@ -0,0 +1,458 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from email import message
|
2 |
+
import re
|
3 |
+
from vncorenlp import VnCoreNLP
|
4 |
+
from nltk.tokenize import sent_tokenize
|
5 |
+
import torch
|
6 |
+
from sentence_transformers import SentenceTransformer
|
7 |
+
import datetime
|
8 |
+
from sklearn.cluster import AgglomerativeClustering
|
9 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
10 |
+
import numpy as np
|
11 |
+
import requests
|
12 |
+
import json
|
13 |
+
from . import utils
|
14 |
+
import time
|
15 |
+
from summary import text_summary, get_summary_bert
|
16 |
+
from function.clean_text import normalize_text
|
17 |
+
# from . import detect_time as dt
|
18 |
+
|
19 |
+
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
20 |
+
model = SentenceTransformer('model/distiluse-base-multilingual-cased-v2').to(device)
|
21 |
+
# model = SentenceTransformer('VoVanPhuc/sup-SimCSE-VietNamese-phobert-base').to(device)
|
22 |
+
# model.save('model/distiluse-base-multilingual-cased-v2')
|
23 |
+
|
24 |
+
use_cuda = torch.cuda.is_available()
|
25 |
+
print(torch.cuda.is_available())
|
26 |
+
if torch.cuda.is_available():
|
27 |
+
model_en = SentenceTransformer('model/paraphrase-mpnet-base-v2').to(device)
|
28 |
+
else:
|
29 |
+
model_en = model
|
30 |
+
# model_en.save('model/paraphrase-mpnet-base-v2')
|
31 |
+
annotator = VnCoreNLP('vncorenlp/VnCoreNLP-1.1.1.jar', port=9191, annotators="wseg,pos", max_heap_size='-Xmx8g')
|
32 |
+
|
33 |
+
|
34 |
+
def detect_postaging(text_in):
|
35 |
+
word_segmented_text = annotator.annotate(text_in)
|
36 |
+
lst_k = []
|
37 |
+
for se in word_segmented_text["sentences"]:
|
38 |
+
for kw in se:
|
39 |
+
if kw["posTag"] in ("Np", "Ny", "N"):
|
40 |
+
if kw["posTag"] == "N" and "_" not in kw["form"]:
|
41 |
+
continue
|
42 |
+
lst_k.append(kw["form"].replace("_", " "))
|
43 |
+
return list(set(lst_k))
|
44 |
+
|
45 |
+
def clean_text(text_in):
|
46 |
+
doc = re.sub('<.*?>', '', text_in)
|
47 |
+
doc = re.sub('(function).*}', ' ', doc)
|
48 |
+
# link
|
49 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
|
50 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
|
51 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
|
52 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
|
53 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
|
54 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
|
55 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
|
56 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
|
57 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
|
58 |
+
|
59 |
+
doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
|
60 |
+
doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
|
61 |
+
doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
|
62 |
+
doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
|
63 |
+
doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
|
64 |
+
doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
|
65 |
+
doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
|
66 |
+
doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
|
67 |
+
doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
|
68 |
+
# escape sequence
|
69 |
+
doc = re.sub('\n', ' ', doc)
|
70 |
+
doc = re.sub('\t', ' ', doc)
|
71 |
+
doc = re.sub('\r', ' ', doc)
|
72 |
+
|
73 |
+
doc = normalize_text(doc)
|
74 |
+
return doc
|
75 |
+
|
76 |
+
|
77 |
+
def data_cleaning(docs):
|
78 |
+
res = []
|
79 |
+
for d in docs:
|
80 |
+
if 'message' in d:
|
81 |
+
# css and js
|
82 |
+
doc = re.sub('<.*?>', '', d['message'])
|
83 |
+
doc = re.sub('(function).*}', ' ', doc)
|
84 |
+
|
85 |
+
# link
|
86 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
|
87 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
|
88 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
|
89 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
|
90 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
|
91 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
|
92 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
|
93 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
|
94 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
|
95 |
+
|
96 |
+
doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
|
97 |
+
doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
|
98 |
+
doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
|
99 |
+
doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
|
100 |
+
doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
|
101 |
+
doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
|
102 |
+
doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
|
103 |
+
doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
|
104 |
+
doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
|
105 |
+
# escape sequence
|
106 |
+
doc = re.sub('\n', ' ', doc)
|
107 |
+
doc = re.sub('\t', ' ', doc)
|
108 |
+
doc = re.sub('\r', ' ', doc)
|
109 |
+
|
110 |
+
d['message'] = doc
|
111 |
+
res.append(d)
|
112 |
+
return res
|
113 |
+
|
114 |
+
|
115 |
+
def segment(docs, lang="vi"):
|
116 |
+
segmented_docs = []
|
117 |
+
for d in docs:
|
118 |
+
# if len(d.get('message', "")) > 8000 or len(d.get('message', "")) < 100:
|
119 |
+
if len(d.get('message', "")) > 8000:
|
120 |
+
continue
|
121 |
+
if 'snippet' not in d:
|
122 |
+
continue
|
123 |
+
try:
|
124 |
+
if lang == "vi":
|
125 |
+
snippet = d.get('snippet', "")
|
126 |
+
segmented_snippet = ""
|
127 |
+
segmented_sentences_snippet = annotator.tokenize(snippet)
|
128 |
+
for sentence in segmented_sentences_snippet:
|
129 |
+
segmented_snippet += ' ' + ' '.join(sentence)
|
130 |
+
segmented_snippet = segmented_snippet.replace('\xa0', '')
|
131 |
+
d['segmented_snippet'] = segmented_snippet
|
132 |
+
segmented_docs.append(d)
|
133 |
+
except Exception:
|
134 |
+
pass
|
135 |
+
return segmented_docs
|
136 |
+
|
137 |
+
|
138 |
+
def timestamp_to_date(timestamp):
|
139 |
+
return datetime.datetime.fromtimestamp(timestamp).strftime('%d/%m/%Y')
|
140 |
+
|
141 |
+
|
142 |
+
def post_processing(response, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster = 50):
|
143 |
+
print(f'[INFO] sorted_field: {sorted_field}')
|
144 |
+
MAX_DOC_PER_CLUSTER = max_doc_per_cluster
|
145 |
+
|
146 |
+
lst_ids = []
|
147 |
+
lst_top = []
|
148 |
+
lst_res = []
|
149 |
+
for i in response:
|
150 |
+
lst_ids.append(i)
|
151 |
+
|
152 |
+
if not sorted_field.strip():
|
153 |
+
lst_top.append(len(response[i]))
|
154 |
+
else:
|
155 |
+
lst_top.append(response[i][0]['max_score'])
|
156 |
+
|
157 |
+
idx = np.argsort(np.array(lst_top))[::-1]
|
158 |
+
if top_cluster == -1:
|
159 |
+
top_cluster = len(idx)
|
160 |
+
for i in idx[: top_cluster]:
|
161 |
+
ik = lst_ids[i]
|
162 |
+
if top_sentence == -1:
|
163 |
+
top_sentence = len(response[ik])
|
164 |
+
lst_check_title = []
|
165 |
+
lst_check_not_title = []
|
166 |
+
i_c_t = 0
|
167 |
+
for resss in response[ik]:
|
168 |
+
r_title = resss.get("title", "")
|
169 |
+
if r_title and not r_title.endswith("..."):
|
170 |
+
lst_check_title.append(resss)
|
171 |
+
i_c_t += 1
|
172 |
+
else:
|
173 |
+
lst_check_not_title.append(resss)
|
174 |
+
if i_c_t == top_sentence:
|
175 |
+
break
|
176 |
+
if i_c_t == top_sentence:
|
177 |
+
lst_res.append(lst_check_title)
|
178 |
+
else:
|
179 |
+
lst_check_title.extend(lst_check_not_title)
|
180 |
+
lst_res.append(lst_check_title[:top_sentence])
|
181 |
+
#lst_res.append(response[ik][:top_sentence])
|
182 |
+
dict_res = {}
|
183 |
+
for i in range(len(lst_res)):
|
184 |
+
dict_res[str(i + 1)] = lst_res[i][:MAX_DOC_PER_CLUSTER]
|
185 |
+
for j in range(min(len(dict_res[str(i + 1)]), 3)):
|
186 |
+
dict_res[str(i + 1)][0]["title_summarize"].append(dict_res[str(i + 1)][j].get("snippet", ""))
|
187 |
+
summary_text = get_summary_bert(dict_res[str(i + 1)][0].get("message", ""), lang=dict_res[str(i + 1)][0].get("lang", "vi"), topn=topn_summary, title=dict_res[str(i + 1)][0].get("title", ""), snippet=dict_res[str(i + 1)][0].get("snippet", ""))
|
188 |
+
if len(summary_text) < 10:
|
189 |
+
summary_text = dict_res[str(i + 1)][0].get("snippet", "")
|
190 |
+
if len(summary_text) < 10:
|
191 |
+
summary_text = dict_res[str(i + 1)][0].get("title", "")
|
192 |
+
dict_res[str(i + 1)][0]["content_summary"] = utils.remove_image_keyword(summary_text)
|
193 |
+
kew_phares = []
|
194 |
+
dict_res[str(i + 1)][0]["topic_keywords"] = kew_phares
|
195 |
+
for j in range(len(dict_res[str(i + 1)])):
|
196 |
+
if "message" in dict_res[str(i + 1)][j]:
|
197 |
+
del dict_res[str(i + 1)][j]["message"]
|
198 |
+
return dict_res
|
199 |
+
|
200 |
+
|
201 |
+
def get_lang(docs):
|
202 |
+
lang_vi = 0
|
203 |
+
lang_en = 0
|
204 |
+
for d in docs:
|
205 |
+
if d.get("lang", "") == "vi":
|
206 |
+
lang_vi += 1
|
207 |
+
else:
|
208 |
+
lang_en += 1
|
209 |
+
if lang_vi >= lang_en:
|
210 |
+
return "vi"
|
211 |
+
return "en"
|
212 |
+
|
213 |
+
|
214 |
+
# def topic_clustering(docs, distance_threshold, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field=''):
|
215 |
+
# global model, model_en
|
216 |
+
# docs = docs[:30000]
|
217 |
+
# lang = get_lang(docs)
|
218 |
+
# result = {}
|
219 |
+
# docs = segment(docs, lang=lang)
|
220 |
+
# if len(docs) < 2:
|
221 |
+
# return result
|
222 |
+
# if lang == "vi":
|
223 |
+
# features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
|
224 |
+
# vectors = model.encode(features, show_progress_bar=False)
|
225 |
+
# else:
|
226 |
+
# features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
|
227 |
+
# vectors = model_en.encode(features, show_progress_bar=False)
|
228 |
+
# clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
|
229 |
+
# linkage='single', distance_threshold=distance_threshold)
|
230 |
+
# clusteror.fit(vectors)
|
231 |
+
# print(clusteror.n_clusters_)
|
232 |
+
# for i in range(clusteror.n_clusters_):
|
233 |
+
# result[str(i + 1)] = []
|
234 |
+
# for i in range(len(clusteror.labels_)):
|
235 |
+
# cluster_no = clusteror.labels_[i]
|
236 |
+
# if docs[i].get('domain','') not in ["cungcau.vn","baomoi.com","news.skydoor.net"]:
|
237 |
+
# response_doc = {}
|
238 |
+
# response_doc = docs[i]
|
239 |
+
# if 'domain' in docs[i]:
|
240 |
+
# response_doc['domain'] = docs[i]['domain']
|
241 |
+
# if 'url' in docs[i]:
|
242 |
+
# response_doc['url'] = docs[i]['url']
|
243 |
+
# if 'title' in docs[i]:
|
244 |
+
# response_doc['title'] = clean_text(docs[i]['title'])
|
245 |
+
# if 'snippet' in docs[i]:
|
246 |
+
# response_doc['snippet'] = clean_text(docs[i]['snippet'])
|
247 |
+
# if 'created_time' in docs[i]:
|
248 |
+
# response_doc['created_time'] = docs[i]['created_time']
|
249 |
+
# if 'message' in docs[i]:
|
250 |
+
# title = docs[i].get('title','')
|
251 |
+
# snippet = docs[i].get('snippet','')
|
252 |
+
# message = docs[i].get('message','')
|
253 |
+
# if title.strip():
|
254 |
+
# split_mess = message.split(title)
|
255 |
+
# if len(split_mess) > 1:
|
256 |
+
# message = title.join(split_mess[1:])
|
257 |
+
# if snippet.strip():
|
258 |
+
# split_mess = message.split(snippet)
|
259 |
+
# if len(split_mess) > 1:
|
260 |
+
# message = snippet.join(split_mess[1:])
|
261 |
+
|
262 |
+
# response_doc['message'] = clean_text(message)
|
263 |
+
# if 'id' in docs[i]:
|
264 |
+
# response_doc['id'] = docs[i]['id']
|
265 |
+
# response_doc['score'] = 0.0
|
266 |
+
# response_doc['title_summarize'] = []
|
267 |
+
# response_doc['content_summary'] = ""
|
268 |
+
# response_doc['total_facebook_viral'] = 0
|
269 |
+
# result[str(cluster_no + 1)].append(response_doc)
|
270 |
+
|
271 |
+
# empty_clus_ids = []
|
272 |
+
# for x in result:
|
273 |
+
# result[x] = sorted(result[x], key=lambda i: -len(i.get('message','')))
|
274 |
+
# if len( result[x]) > 0:
|
275 |
+
# if len(result[x]) > 1:
|
276 |
+
# result[x] = check_duplicate_title_domain(result[x])
|
277 |
+
# result[x][0]['num_docs'] = len(result[x])
|
278 |
+
# else:
|
279 |
+
# empty_clus_ids.append(x)
|
280 |
+
|
281 |
+
# for x in empty_clus_ids:
|
282 |
+
# result.pop(x,None)
|
283 |
+
# # result = dict(sorted(result.items(), key=lambda i: -len(i[1])))[:top_cluster]
|
284 |
+
# return post_processing(result, top_cluster=top_cluster, top_sentence=top_sentence, topn_summary=topn_summary, sorted_field = sorted_field)
|
285 |
+
|
286 |
+
def topic_clustering(docs, distance_threshold, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster=50, delete_message=True):
|
287 |
+
global model, model_en
|
288 |
+
docs = docs[:30000]
|
289 |
+
lang = get_lang(docs)
|
290 |
+
result = {}
|
291 |
+
cluster_score = {}
|
292 |
+
# docs = segment(docs, lang=lang)
|
293 |
+
if len(docs) < 2:
|
294 |
+
return result
|
295 |
+
if lang == "vi":
|
296 |
+
features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
|
297 |
+
vectors = model.encode(features, show_progress_bar=False)
|
298 |
+
else:
|
299 |
+
features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
|
300 |
+
vectors = model_en.encode(features, show_progress_bar=False)
|
301 |
+
clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
|
302 |
+
linkage='single', distance_threshold=distance_threshold)
|
303 |
+
clusteror.fit(vectors)
|
304 |
+
print(clusteror.n_clusters_)
|
305 |
+
for i in range(clusteror.n_clusters_):
|
306 |
+
result[str(i + 1)] = []
|
307 |
+
cluster_score[str(i + 1)] = 0
|
308 |
+
for i in range(len(clusteror.labels_)):
|
309 |
+
cluster_no = clusteror.labels_[i]
|
310 |
+
if docs[i].get('domain','') not in ["cungcau.vn","baomoi.com","news.skydoor.net"]:
|
311 |
+
response_doc = {}
|
312 |
+
response_doc = docs[i]
|
313 |
+
score = response_doc.get('score', 0)
|
314 |
+
if not docs[i].get('message','').strip():
|
315 |
+
continue
|
316 |
+
if score > cluster_score[str(cluster_no + 1)]:
|
317 |
+
cluster_score[str(cluster_no + 1)] = score
|
318 |
+
if 'domain' in docs[i]:
|
319 |
+
response_doc['domain'] = docs[i]['domain']
|
320 |
+
if 'url' in docs[i]:
|
321 |
+
response_doc['url'] = docs[i]['url']
|
322 |
+
if 'title' in docs[i]:
|
323 |
+
response_doc['title'] = clean_text(docs[i]['title'])
|
324 |
+
if 'snippet' in docs[i]:
|
325 |
+
response_doc['snippet'] = clean_text(docs[i]['snippet'])
|
326 |
+
if 'created_time' in docs[i]:
|
327 |
+
response_doc['created_time'] = docs[i]['created_time']
|
328 |
+
if 'message' in docs[i]:
|
329 |
+
title = docs[i].get('title','')
|
330 |
+
snippet = docs[i].get('snippet','')
|
331 |
+
message = docs[i].get('message','')
|
332 |
+
if title.strip():
|
333 |
+
split_mess = message.split(title)
|
334 |
+
if len(split_mess) > 1:
|
335 |
+
message = title.join(split_mess[1:])
|
336 |
+
if snippet.strip():
|
337 |
+
split_mess = message.split(snippet)
|
338 |
+
if len(split_mess) > 1:
|
339 |
+
message = snippet.join(split_mess[1:])
|
340 |
+
|
341 |
+
response_doc['message'] = clean_text(message)
|
342 |
+
if 'id' in docs[i]:
|
343 |
+
response_doc['id'] = docs[i]['id']
|
344 |
+
# response_doc['score'] = 0.0
|
345 |
+
response_doc['title_summarize'] = []
|
346 |
+
response_doc['content_summary'] = ""
|
347 |
+
response_doc['total_facebook_viral'] = 0
|
348 |
+
result[str(cluster_no + 1)].append(response_doc)
|
349 |
+
|
350 |
+
empty_clus_ids = []
|
351 |
+
for x in result:
|
352 |
+
result[x] = sorted(result[x], key=lambda i: -len(i.get('message','')))
|
353 |
+
if len( result[x]) > 0:
|
354 |
+
if len(result[x]) > 1:
|
355 |
+
result[x] = check_duplicate_title_domain(result[x])
|
356 |
+
result[x][0]['num_docs'] = len(result[x])
|
357 |
+
result[x][0]['max_score'] = cluster_score[x]
|
358 |
+
else:
|
359 |
+
empty_clus_ids.append(x)
|
360 |
+
|
361 |
+
for x in empty_clus_ids:
|
362 |
+
result.pop(x,None)
|
363 |
+
# result = dict(sorted(result.items(), key=lambda i: -len(i[1])))[:top_cluster]
|
364 |
+
return post_processing(result, top_cluster=top_cluster, top_sentence=top_sentence, topn_summary=topn_summary, sorted_field = sorted_field, max_doc_per_cluster=max_doc_per_cluster)
|
365 |
+
|
366 |
+
def check_duplicate_title_domain(docs):
|
367 |
+
lst_title_domain = [f"{d.get('domain', '')} {d.get('title','')}" for d in docs]
|
368 |
+
for i in range(1,len(lst_title_domain) -1):
|
369 |
+
for j in range(i+1,len(lst_title_domain)):
|
370 |
+
if lst_title_domain[j] == lst_title_domain[i]:
|
371 |
+
lst_title_domain[j] = 'dup'
|
372 |
+
lst_filter_docs = [docs[i] for i,x in enumerate(lst_title_domain) if x != 'dup']
|
373 |
+
return lst_filter_docs
|
374 |
+
def convert_date(text):
|
375 |
+
text = text.replace(".", "/")
|
376 |
+
text = text.replace("-", "/")
|
377 |
+
return text
|
378 |
+
|
379 |
+
|
380 |
+
def check_keyword(sentence):
|
381 |
+
keyword = ['sáng', 'trưa', 'chiều', 'tối', 'đến', 'hôm', 'ngày', 'tới']
|
382 |
+
for k in keyword:
|
383 |
+
if k in sentence:
|
384 |
+
return True
|
385 |
+
return False
|
386 |
+
|
387 |
+
|
388 |
+
def extract_events_and_time(docs, publish_date):
|
389 |
+
def standardize(date_str):
|
390 |
+
return date_str.replace('.', '/').replace('-', '/')
|
391 |
+
|
392 |
+
def add_0(date_str):
|
393 |
+
|
394 |
+
date_str = date_str.split('/')
|
395 |
+
res = []
|
396 |
+
for o in date_str:
|
397 |
+
o = re.sub('\s+', '', o)
|
398 |
+
if len(o) < 2:
|
399 |
+
o = '0' + o
|
400 |
+
res.append(o)
|
401 |
+
date_str = '/'.join(res)
|
402 |
+
return date_str
|
403 |
+
|
404 |
+
def get_date_list(reg, sentence):
|
405 |
+
find_object = re.finditer(reg, sentence)
|
406 |
+
date_list = [x.group() for x in find_object]
|
407 |
+
return date_list
|
408 |
+
|
409 |
+
year = publish_date.split('/')[2]
|
410 |
+
|
411 |
+
# dd/mm/yyyy
|
412 |
+
reg_exp_1 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
|
413 |
+
# #mm/yyyy
|
414 |
+
# reg_exp_5 = '(\D|^)(?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
|
415 |
+
# dd/mm
|
416 |
+
reg_exp_2 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])(\D|$)'
|
417 |
+
|
418 |
+
# ngày dd tháng mm năm yyyy
|
419 |
+
reg_exp_3 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}\s*(năm)\s*\d{4}'
|
420 |
+
# ngày dd tháng mm
|
421 |
+
reg_exp_4 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}'
|
422 |
+
|
423 |
+
result = []
|
424 |
+
for d in docs:
|
425 |
+
text = d['message']
|
426 |
+
for sentence in sent_tokenize(text):
|
427 |
+
lower_sentence = sentence.lower()
|
428 |
+
c = re.search(reg_exp_3, sentence.lower())
|
429 |
+
d = re.search(reg_exp_4, sentence.lower())
|
430 |
+
# e = re.search(reg_exp_5, sentence.lower())
|
431 |
+
a = re.search(reg_exp_1, sentence)
|
432 |
+
b = re.search(reg_exp_2, sentence)
|
433 |
+
#
|
434 |
+
if (a or b or c or d) and check_keyword(lower_sentence):
|
435 |
+
date_list = get_date_list(reg_exp_1, lower_sentence)
|
436 |
+
date_entity = ''
|
437 |
+
if date_list:
|
438 |
+
date_entity = add_0(standardize(date_list[0]))
|
439 |
+
elif get_date_list(reg_exp_2, lower_sentence):
|
440 |
+
date_list = get_date_list(reg_exp_2, lower_sentence)
|
441 |
+
date_entity = add_0(standardize(date_list[0]) + '/' + year)
|
442 |
+
elif get_date_list(reg_exp_3, lower_sentence):
|
443 |
+
date_list = get_date_list(reg_exp_3, lower_sentence)
|
444 |
+
|
445 |
+
date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
|
446 |
+
date_entity = re.sub('\s+', ' ', date_entity)
|
447 |
+
date_entity = date_entity.replace(' ', '/')
|
448 |
+
date_entity = add_0(date_entity)
|
449 |
+
else:
|
450 |
+
date_list = get_date_list(reg_exp_4, lower_sentence)
|
451 |
+
if date_list != []:
|
452 |
+
date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
|
453 |
+
date_entity = re.sub('\s+', ' ', date_entity)
|
454 |
+
date_entity = date_entity.replace(' ', '/')
|
455 |
+
date_entity = date_entity + '/' + year
|
456 |
+
date_entity = add_0(date_entity)
|
457 |
+
result.append((sentence, date_entity))
|
458 |
+
return result
|
function/topic_clustering_mnews.py
ADDED
@@ -0,0 +1,339 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from vncorenlp import VnCoreNLP
|
3 |
+
from nltk.tokenize import sent_tokenize
|
4 |
+
import torch
|
5 |
+
from sentence_transformers import SentenceTransformer
|
6 |
+
import datetime
|
7 |
+
from sklearn.cluster import AgglomerativeClustering
|
8 |
+
|
9 |
+
import numpy as np
|
10 |
+
import requests
|
11 |
+
import json
|
12 |
+
from . import utils
|
13 |
+
import time
|
14 |
+
from summary import text_summary, get_summary_bert
|
15 |
+
# from . import detect_time as dt
|
16 |
+
|
17 |
+
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
18 |
+
model = SentenceTransformer('VoVanPhuc/sup-SimCSE-VietNamese-phobert-base').to(device)
|
19 |
+
model_en = SentenceTransformer('paraphrase-mpnet-base-v2').to(device)
|
20 |
+
annotator = VnCoreNLP('vncorenlp/VnCoreNLP-1.1.1.jar', port=9191, annotators="wseg,pos", max_heap_size='-Xmx8g')
|
21 |
+
|
22 |
+
|
23 |
+
def detect_postaging(text_in):
|
24 |
+
word_segmented_text = annotator.annotate(text_in)
|
25 |
+
lst_k = []
|
26 |
+
for se in word_segmented_text["sentences"]:
|
27 |
+
for kw in se:
|
28 |
+
if kw["posTag"] in ("Np", "Ny", "N"):
|
29 |
+
if kw["posTag"] == "N" and "_" not in kw["form"]:
|
30 |
+
continue
|
31 |
+
lst_k.append(kw["form"].replace("_", " "))
|
32 |
+
return list(set(lst_k))
|
33 |
+
|
34 |
+
def clean_text(text_in):
|
35 |
+
doc = re.sub('<.*?>', '', text_in)
|
36 |
+
doc = re.sub('(function).*}', ' ', doc)
|
37 |
+
# link
|
38 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
|
39 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
|
40 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
|
41 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
|
42 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
|
43 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
|
44 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
|
45 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
|
46 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
|
47 |
+
|
48 |
+
doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
|
49 |
+
doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
|
50 |
+
doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
|
51 |
+
doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
|
52 |
+
doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
|
53 |
+
doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
|
54 |
+
doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
|
55 |
+
doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
|
56 |
+
doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
|
57 |
+
# escape sequence
|
58 |
+
doc = re.sub('\n', ' ', doc)
|
59 |
+
doc = re.sub('\t', ' ', doc)
|
60 |
+
doc = re.sub('\r', ' ', doc)
|
61 |
+
return doc
|
62 |
+
|
63 |
+
|
64 |
+
def data_cleaning(docs):
|
65 |
+
res = []
|
66 |
+
for d in docs:
|
67 |
+
if 'message' in d:
|
68 |
+
# css and js
|
69 |
+
doc = re.sub('<.*?>', '', d['message'])
|
70 |
+
doc = re.sub('(function).*}', ' ', doc)
|
71 |
+
|
72 |
+
# link
|
73 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
|
74 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
|
75 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
|
76 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
|
77 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
|
78 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
|
79 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
|
80 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
|
81 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
|
82 |
+
|
83 |
+
doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
|
84 |
+
doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
|
85 |
+
doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
|
86 |
+
doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
|
87 |
+
doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
|
88 |
+
doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
|
89 |
+
doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
|
90 |
+
doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
|
91 |
+
doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
|
92 |
+
# escape sequence
|
93 |
+
doc = re.sub('\n', ' ', doc)
|
94 |
+
doc = re.sub('\t', ' ', doc)
|
95 |
+
doc = re.sub('\r', ' ', doc)
|
96 |
+
|
97 |
+
d['message'] = doc
|
98 |
+
res.append(d)
|
99 |
+
return res
|
100 |
+
|
101 |
+
|
102 |
+
def segment(docs, lang="vi"):
|
103 |
+
segmented_docs = []
|
104 |
+
for d in docs:
|
105 |
+
# if len(d.get('message', "")) > 8000 or len(d.get('message', "")) < 100:
|
106 |
+
# continue
|
107 |
+
if 'snippet' not in d and 'title' not in d:
|
108 |
+
continue
|
109 |
+
try:
|
110 |
+
if lang == "vi":
|
111 |
+
snippet = d.get('snippet', "")
|
112 |
+
segmented_snippet = ""
|
113 |
+
segmented_sentences_snippet = annotator.tokenize(snippet)
|
114 |
+
for sentence in segmented_sentences_snippet:
|
115 |
+
segmented_snippet += ' ' + ' '.join(sentence)
|
116 |
+
segmented_snippet = segmented_snippet.replace('\xa0', '')
|
117 |
+
d['segmented_snippet'] = segmented_snippet
|
118 |
+
segmented_docs.append(d)
|
119 |
+
except Exception:
|
120 |
+
pass
|
121 |
+
return segmented_docs
|
122 |
+
|
123 |
+
|
124 |
+
def timestamp_to_date(timestamp):
|
125 |
+
return datetime.datetime.fromtimestamp(timestamp).strftime('%d/%m/%Y')
|
126 |
+
|
127 |
+
|
128 |
+
def sort_content(lst_res):
|
129 |
+
lst_content = []
|
130 |
+
lst_cnt = []
|
131 |
+
for i in range(len(lst_res)):
|
132 |
+
lst_cnt.append(len(lst_res[i].get("message", "")))
|
133 |
+
id_sort = np.argsort(np.array(lst_cnt))[::-1]
|
134 |
+
for i in id_sort:
|
135 |
+
lst_content.append(lst_res[i])
|
136 |
+
return lst_content
|
137 |
+
|
138 |
+
|
139 |
+
|
140 |
+
def post_processing(response, top_cluster=5, top_sentence=5, topn_summary=5):
|
141 |
+
lst_ids = []
|
142 |
+
lst_top = []
|
143 |
+
lst_res = []
|
144 |
+
for i in response:
|
145 |
+
lst_ids.append(i)
|
146 |
+
lst_top.append(len(response[i]))
|
147 |
+
idx = np.argsort(np.array(lst_top))[::-1]
|
148 |
+
if top_cluster == -1:
|
149 |
+
top_cluster = len(idx)
|
150 |
+
for i in idx[: top_cluster]:
|
151 |
+
ik = lst_ids[i]
|
152 |
+
if top_sentence == -1:
|
153 |
+
top_sentence = len(response[ik])
|
154 |
+
lst_check_title = []
|
155 |
+
lst_check_not_title = []
|
156 |
+
i_c_t = 0
|
157 |
+
response_sort = sort_content(response[ik].copy())
|
158 |
+
for resss in response_sort:
|
159 |
+
if resss.get("title", ""):
|
160 |
+
lst_check_title.append(resss)
|
161 |
+
i_c_t += 1
|
162 |
+
else:
|
163 |
+
lst_check_not_title.append(resss)
|
164 |
+
if i_c_t == top_sentence:
|
165 |
+
break
|
166 |
+
if i_c_t == top_sentence:
|
167 |
+
lst_res.append(lst_check_title)
|
168 |
+
else:
|
169 |
+
lst_check_title.extend(lst_check_not_title)
|
170 |
+
lst_res.append(lst_check_title[:top_sentence])
|
171 |
+
dict_res = {}
|
172 |
+
for i in range(len(lst_res)):
|
173 |
+
dict_res[str(i + 1)] = lst_res[i]
|
174 |
+
for j in range(min(len(dict_res[str(i + 1)]), 3)):
|
175 |
+
dict_res[str(i + 1)][0]["title_summarize"].append(dict_res[str(i + 1)][j].get("snippet", ""))
|
176 |
+
summary_text = get_summary_bert(dict_res[str(i + 1)][0].get("message", ""), lang = dict_res[str(i + 1)][0].get("lang", "vi"), topn=topn_summary)
|
177 |
+
if len(summary_text) < 10:
|
178 |
+
summary_text = dict_res[str(i + 1)][0].get("snippet", "")
|
179 |
+
if len(summary_text) < 10:
|
180 |
+
summary_text = dict_res[str(i + 1)][0].get("title", "")
|
181 |
+
dict_res[str(i + 1)][0]["content_summary"] = utils.remove_image_keyword(summary_text)
|
182 |
+
kew_phares = []
|
183 |
+
dict_res[str(i + 1)][0]["topic_keywords"] = kew_phares
|
184 |
+
for j in range(len(dict_res[str(i + 1)])):
|
185 |
+
if "message" in dict_res[str(i + 1)][j]:
|
186 |
+
del dict_res[str(i + 1)][j]["message"]
|
187 |
+
return dict_res
|
188 |
+
|
189 |
+
|
190 |
+
def get_lang(docs):
|
191 |
+
lang_vi = 0
|
192 |
+
lang_en = 0
|
193 |
+
docs_lang_vi = []
|
194 |
+
docs_lang_en = []
|
195 |
+
for d in docs:
|
196 |
+
if d.get("lang", "") == "en":
|
197 |
+
lang_en += 1
|
198 |
+
docs_lang_en.append(d)
|
199 |
+
else:
|
200 |
+
lang_vi += 1
|
201 |
+
docs_lang_vi.append(d)
|
202 |
+
if lang_vi > lang_en:
|
203 |
+
return "vi", docs_lang_vi
|
204 |
+
return "en", docs_lang_en
|
205 |
+
|
206 |
+
|
207 |
+
def topic_clustering(docs, distance_threshold, top_cluster=5, top_sentence=5, topn_summary=5, benchmark_id=1):
|
208 |
+
global model, model_en
|
209 |
+
lang, docs = get_lang(docs)
|
210 |
+
result = {}
|
211 |
+
docs = segment(docs, lang=lang)
|
212 |
+
print("docs segment: ", len(docs))
|
213 |
+
if len(docs) < 2:
|
214 |
+
return result
|
215 |
+
if lang == "vi":
|
216 |
+
features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
|
217 |
+
vectors = model.encode(features, show_progress_bar=False)
|
218 |
+
else:
|
219 |
+
features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
|
220 |
+
vectors = model_en.encode(features, show_progress_bar=False)
|
221 |
+
clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
|
222 |
+
linkage='single', distance_threshold=distance_threshold)
|
223 |
+
clusteror.fit(vectors)
|
224 |
+
print(clusteror.n_clusters_)
|
225 |
+
for i in range(clusteror.n_clusters_):
|
226 |
+
result[str(i + 1)] = []
|
227 |
+
for i in range(len(clusteror.labels_)):
|
228 |
+
cluster_no = clusteror.labels_[i]
|
229 |
+
response_doc = {}
|
230 |
+
if 'url' in docs[i]:
|
231 |
+
response_doc['url'] = docs[i]['url']
|
232 |
+
if 'domain' in docs[i]:
|
233 |
+
response_doc['domain'] = docs[i]['domain']
|
234 |
+
if 'title' in docs[i]:
|
235 |
+
response_doc['title'] = clean_text(docs[i]['title'])
|
236 |
+
if 'snippet' in docs[i]:
|
237 |
+
response_doc['snippet'] = clean_text(docs[i]['snippet'])
|
238 |
+
if 'created_time' in docs[i]:
|
239 |
+
response_doc['created_time'] = docs[i]['created_time']
|
240 |
+
if 'message' in docs[i]:
|
241 |
+
response_doc['message'] = clean_text(docs[i]['message'])
|
242 |
+
if 'id' in docs[i]:
|
243 |
+
response_doc['id'] = docs[i]['id']
|
244 |
+
response_doc['score'] = 0.0
|
245 |
+
response_doc['title_summarize'] = []
|
246 |
+
response_doc['content_summary'] = ""
|
247 |
+
response_doc['total_facebook_viral'] = 0
|
248 |
+
result[str(cluster_no + 1)].append(response_doc)
|
249 |
+
# print("before filter: ", len(result))
|
250 |
+
# result = smart_filter(result, benchmark_id=benchmark_id)
|
251 |
+
# print("after filter: ", len(result))
|
252 |
+
return post_processing(result, top_cluster=top_cluster, top_sentence=top_sentence, topn_summary=topn_summary)
|
253 |
+
|
254 |
+
|
255 |
+
def convert_date(text):
|
256 |
+
text = text.replace(".", "/")
|
257 |
+
text = text.replace("-", "/")
|
258 |
+
return text
|
259 |
+
|
260 |
+
|
261 |
+
def check_keyword(sentence):
|
262 |
+
keyword = ['sáng', 'trưa', 'chiều', 'tối', 'đến', 'hôm', 'ngày', 'tới']
|
263 |
+
for k in keyword:
|
264 |
+
if k in sentence:
|
265 |
+
return True
|
266 |
+
return False
|
267 |
+
|
268 |
+
|
269 |
+
def extract_events_and_time(docs, publish_date):
|
270 |
+
def standardize(date_str):
|
271 |
+
return date_str.replace('.', '/').replace('-', '/')
|
272 |
+
|
273 |
+
def add_0(date_str):
|
274 |
+
|
275 |
+
date_str = date_str.split('/')
|
276 |
+
res = []
|
277 |
+
for o in date_str:
|
278 |
+
o = re.sub('\s+', '', o)
|
279 |
+
if len(o) < 2:
|
280 |
+
o = '0' + o
|
281 |
+
res.append(o)
|
282 |
+
date_str = '/'.join(res)
|
283 |
+
return date_str
|
284 |
+
|
285 |
+
def get_date_list(reg, sentence):
|
286 |
+
find_object = re.finditer(reg, sentence)
|
287 |
+
date_list = [x.group() for x in find_object]
|
288 |
+
return date_list
|
289 |
+
|
290 |
+
year = publish_date.split('/')[2]
|
291 |
+
|
292 |
+
# dd/mm/yyyy
|
293 |
+
reg_exp_1 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
|
294 |
+
# #mm/yyyy
|
295 |
+
# reg_exp_5 = '(\D|^)(?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
|
296 |
+
# dd/mm
|
297 |
+
reg_exp_2 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])(\D|$)'
|
298 |
+
|
299 |
+
# ngày dd tháng mm năm yyyy
|
300 |
+
reg_exp_3 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}\s*(năm)\s*\d{4}'
|
301 |
+
# ngày dd tháng mm
|
302 |
+
reg_exp_4 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}'
|
303 |
+
|
304 |
+
result = []
|
305 |
+
for d in docs:
|
306 |
+
text = d['message']
|
307 |
+
for sentence in sent_tokenize(text):
|
308 |
+
lower_sentence = sentence.lower()
|
309 |
+
c = re.search(reg_exp_3, sentence.lower())
|
310 |
+
d = re.search(reg_exp_4, sentence.lower())
|
311 |
+
# e = re.search(reg_exp_5, sentence.lower())
|
312 |
+
a = re.search(reg_exp_1, sentence)
|
313 |
+
b = re.search(reg_exp_2, sentence)
|
314 |
+
#
|
315 |
+
if (a or b or c or d) and check_keyword(lower_sentence):
|
316 |
+
date_list = get_date_list(reg_exp_1, lower_sentence)
|
317 |
+
date_entity = ''
|
318 |
+
if date_list:
|
319 |
+
date_entity = add_0(standardize(date_list[0]))
|
320 |
+
elif get_date_list(reg_exp_2, lower_sentence):
|
321 |
+
date_list = get_date_list(reg_exp_2, lower_sentence)
|
322 |
+
date_entity = add_0(standardize(date_list[0]) + '/' + year)
|
323 |
+
elif get_date_list(reg_exp_3, lower_sentence):
|
324 |
+
date_list = get_date_list(reg_exp_3, lower_sentence)
|
325 |
+
|
326 |
+
date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
|
327 |
+
date_entity = re.sub('\s+', ' ', date_entity)
|
328 |
+
date_entity = date_entity.replace(' ', '/')
|
329 |
+
date_entity = add_0(date_entity)
|
330 |
+
else:
|
331 |
+
date_list = get_date_list(reg_exp_4, lower_sentence)
|
332 |
+
if date_list != []:
|
333 |
+
date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
|
334 |
+
date_entity = re.sub('\s+', ' ', date_entity)
|
335 |
+
date_entity = date_entity.replace(' ', '/')
|
336 |
+
date_entity = date_entity + '/' + year
|
337 |
+
date_entity = add_0(date_entity)
|
338 |
+
result.append((sentence, date_entity))
|
339 |
+
return result
|
function/topic_clustering_not_summary.py
ADDED
@@ -0,0 +1,463 @@
1 |
+
|
2 |
+
import re
|
3 |
+
|
4 |
+
from vncorenlp import VnCoreNLP
|
5 |
+
from nltk.tokenize import sent_tokenize
|
6 |
+
import torch
|
7 |
+
from sentence_transformers import SentenceTransformer
|
8 |
+
import datetime
|
9 |
+
from sklearn.cluster import AgglomerativeClustering
|
10 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
11 |
+
import numpy as np
|
12 |
+
import requests
|
13 |
+
import json
|
14 |
+
from . import utils
|
15 |
+
import time
|
16 |
+
from summary import text_summary, get_summary_bert
|
17 |
+
from function.clean_text import normalize_text
|
18 |
+
# from . import detect_time as dt
|
19 |
+
|
20 |
+
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
21 |
+
# model = SentenceTransformer('model/distiluse-base-multilingual-cased-v2').to(device)
|
22 |
+
model = SentenceTransformer('model/paraphrase-multilingual-MiniLM-L12-v2')
|
23 |
+
|
24 |
+
# model = SentenceTransformer('VoVanPhuc/sup-SimCSE-VietNamese-phobert-base').to(device)
|
25 |
+
# model.save('model/distiluse-base-multilingual-cased-v2')
|
26 |
+
|
27 |
+
use_cuda = torch.cuda.is_available()
|
28 |
+
print(torch.cuda.is_available())
|
29 |
+
if torch.cuda.is_available():
|
30 |
+
model_en = SentenceTransformer('model/paraphrase-mpnet-base-v2').to(device)
|
31 |
+
else:
|
32 |
+
model_en = model
|
33 |
+
# model_en.save('model/paraphrase-mpnet-base-v2')
|
34 |
+
annotator = VnCoreNLP('vncorenlp/VnCoreNLP-1.1.1.jar', port=9191, annotators="wseg,pos", max_heap_size='-Xmx8g')
|
35 |
+
|
36 |
+
|
37 |
+
def detect_postaging(text_in):
|
38 |
+
word_segmented_text = annotator.annotate(text_in)
|
39 |
+
lst_k = []
|
40 |
+
for se in word_segmented_text["sentences"]:
|
41 |
+
for kw in se:
|
42 |
+
if kw["posTag"] in ("Np", "Ny", "N"):
|
43 |
+
if kw["posTag"] == "N" and "_" not in kw["form"]:
|
44 |
+
continue
|
45 |
+
lst_k.append(kw["form"].replace("_", " "))
|
46 |
+
return list(set(lst_k))
|
47 |
+
|
48 |
+
def clean_text(text_in):
|
49 |
+
doc = re.sub('<.*?>', '', text_in)
|
50 |
+
doc = re.sub('(function).*}', ' ', doc)
|
51 |
+
# link
|
52 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
|
53 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
|
54 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
|
55 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
|
56 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
|
57 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
|
58 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
|
59 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
|
60 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
|
61 |
+
|
62 |
+
doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
|
63 |
+
doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
|
64 |
+
doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
|
65 |
+
doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
|
66 |
+
doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
|
67 |
+
doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
|
68 |
+
doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
|
69 |
+
doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
|
70 |
+
doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
|
71 |
+
# escape sequence
|
72 |
+
doc = re.sub('\n', ' ', doc)
|
73 |
+
doc = re.sub('\t', ' ', doc)
|
74 |
+
doc = re.sub('\r', ' ', doc)
|
75 |
+
|
76 |
+
doc = normalize_text(doc)
|
77 |
+
return doc
|
78 |
+
|
79 |
+
|
80 |
+
def data_cleaning(docs):
|
81 |
+
res = []
|
82 |
+
for d in docs:
|
83 |
+
if 'message' in d:
|
84 |
+
# css and js
|
85 |
+
doc = re.sub('<.*?>', '', d['message'])
|
86 |
+
doc = re.sub('(function).*}', ' ', doc)
|
87 |
+
|
88 |
+
# link
|
89 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
|
90 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
|
91 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
|
92 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
|
93 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
|
94 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
|
95 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
|
96 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
|
97 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
|
98 |
+
|
99 |
+
doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
|
100 |
+
doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
|
101 |
+
doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
|
102 |
+
doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
|
103 |
+
doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
|
104 |
+
doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
|
105 |
+
doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
|
106 |
+
doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
|
107 |
+
doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
|
108 |
+
# escape sequence
|
109 |
+
doc = re.sub('\n', ' ', doc)
|
110 |
+
doc = re.sub('\t', ' ', doc)
|
111 |
+
doc = re.sub('\r', ' ', doc)
|
112 |
+
|
113 |
+
d['message'] = doc
|
114 |
+
res.append(d)
|
115 |
+
return res
|
116 |
+
|
117 |
+
|
118 |
+
def segment(docs, lang="vi"):
|
119 |
+
segmented_docs = []
|
120 |
+
for d in docs:
|
121 |
+
# if len(d.get('message', "")) > 8000 or len(d.get('message', "")) < 100:
|
122 |
+
if len(d.get('message', "")) > 8000:
|
123 |
+
continue
|
124 |
+
if 'snippet' not in d:
|
125 |
+
continue
|
126 |
+
try:
|
127 |
+
if lang == "vi":
|
128 |
+
snippet = d.get('snippet', "")
|
129 |
+
segmented_snippet = ""
|
130 |
+
segmented_sentences_snippet = annotator.tokenize(snippet)
|
131 |
+
for sentence in segmented_sentences_snippet:
|
132 |
+
segmented_snippet += ' ' + ' '.join(sentence)
|
133 |
+
segmented_snippet = segmented_snippet.replace('\xa0', '')
|
134 |
+
d['segmented_snippet'] = segmented_snippet
|
135 |
+
segmented_docs.append(d)
|
136 |
+
except Exception:
|
137 |
+
pass
|
138 |
+
return segmented_docs
|
139 |
+
|
140 |
+
|
141 |
+
def timestamp_to_date(timestamp):
|
142 |
+
return datetime.datetime.fromtimestamp(timestamp).strftime('%d/%m/%Y')
|
143 |
+
|
144 |
+
|
145 |
+
def post_processing(response, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster = 50, delete_message=True):
|
146 |
+
print(f'[INFO] sorted_field: {sorted_field}')
|
147 |
+
MAX_DOC_PER_CLUSTER = max_doc_per_cluster
|
148 |
+
|
149 |
+
lst_ids = []
|
150 |
+
lst_top = []
|
151 |
+
lst_res = []
|
152 |
+
for i in response:
|
153 |
+
lst_ids.append(i)
|
154 |
+
|
155 |
+
if not sorted_field.strip():
|
156 |
+
lst_top.append(len(response[i]))
|
157 |
+
else:
|
158 |
+
lst_top.append(response[i][0]['max_score'])
|
159 |
+
|
160 |
+
idx = np.argsort(np.array(lst_top))[::-1]
|
161 |
+
if top_cluster == -1:
|
162 |
+
top_cluster = len(idx)
|
163 |
+
for i in idx[: top_cluster]:
|
164 |
+
ik = lst_ids[i]
|
165 |
+
if top_sentence == -1:
|
166 |
+
top_sentence = len(response[ik])
|
167 |
+
lst_check_title = []
|
168 |
+
lst_check_not_title = []
|
169 |
+
i_c_t = 0
|
170 |
+
for resss in response[ik]:
|
171 |
+
r_title = resss.get("title", "")
|
172 |
+
if r_title and not r_title.endswith("..."):
|
173 |
+
lst_check_title.append(resss)
|
174 |
+
i_c_t += 1
|
175 |
+
else:
|
176 |
+
lst_check_not_title.append(resss)
|
177 |
+
if i_c_t == top_sentence:
|
178 |
+
break
|
179 |
+
if i_c_t == top_sentence:
|
180 |
+
lst_res.append(lst_check_title)
|
181 |
+
else:
|
182 |
+
lst_check_title.extend(lst_check_not_title)
|
183 |
+
lst_res.append(lst_check_title[:top_sentence])
|
184 |
+
#lst_res.append(response[ik][:top_sentence])
|
185 |
+
dict_res = {}
|
186 |
+
for i in range(len(lst_res)):
|
187 |
+
dict_res[str(i + 1)] = lst_res[i][:MAX_DOC_PER_CLUSTER]
|
188 |
+
for j in range(min(len(dict_res[str(i + 1)]), 3)):
|
189 |
+
dict_res[str(i + 1)][0]["title_summarize"].append(dict_res[str(i + 1)][j].get("snippet", ""))
|
190 |
+
summary_text = get_summary_bert(dict_res[str(i + 1)][0].get("message", ""), lang=dict_res[str(i + 1)][0].get("lang", "vi"), topn=topn_summary, title=dict_res[str(i + 1)][0].get("title", ""), snippet=dict_res[str(i + 1)][0].get("snippet", ""))
|
191 |
+
if len(summary_text) < 10:
|
192 |
+
summary_text = dict_res[str(i + 1)][0].get("snippet", "")
|
193 |
+
if len(summary_text) < 10:
|
194 |
+
summary_text = dict_res[str(i + 1)][0].get("title", "")
|
195 |
+
dict_res[str(i + 1)][0]["content_summary"] = utils.remove_image_keyword(summary_text)
|
196 |
+
kew_phares = []
|
197 |
+
dict_res[str(i + 1)][0]["topic_keywords"] = kew_phares
|
198 |
+
|
199 |
+
if delete_message:
|
200 |
+
for j in range(len(dict_res[str(i + 1)])):
|
201 |
+
if "message" in dict_res[str(i + 1)][j]:
|
202 |
+
del dict_res[str(i + 1)][j]["message"]
|
203 |
+
return dict_res
|
204 |
+
|
205 |
+
|
206 |
+
def get_lang(docs):
|
207 |
+
lang_vi = 0
|
208 |
+
lang_en = 0
|
209 |
+
for d in docs:
|
210 |
+
if d.get("lang", "") == "vi":
|
211 |
+
lang_vi += 1
|
212 |
+
else:
|
213 |
+
lang_en += 1
|
214 |
+
if lang_vi >= lang_en:
|
215 |
+
return "vi"
|
216 |
+
return "en"
|
217 |
+
|
218 |
+
|
219 |
+
# def topic_clustering(docs, distance_threshold, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field=''):
|
220 |
+
# global model, model_en
|
221 |
+
# docs = docs[:30000]
|
222 |
+
# lang = get_lang(docs)
|
223 |
+
# result = {}
|
224 |
+
# docs = segment(docs, lang=lang)
|
225 |
+
# if len(docs) < 2:
|
226 |
+
# return result
|
227 |
+
# if lang == "vi":
|
228 |
+
# features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
|
229 |
+
# vectors = model.encode(features, show_progress_bar=False)
|
230 |
+
# else:
|
231 |
+
# features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
|
232 |
+
# vectors = model_en.encode(features, show_progress_bar=False)
|
233 |
+
# clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
|
234 |
+
# linkage='single', distance_threshold=distance_threshold)
|
235 |
+
# clusteror.fit(vectors)
|
236 |
+
# print(clusteror.n_clusters_)
|
237 |
+
# for i in range(clusteror.n_clusters_):
|
238 |
+
# result[str(i + 1)] = []
|
239 |
+
# for i in range(len(clusteror.labels_)):
|
240 |
+
# cluster_no = clusteror.labels_[i]
|
241 |
+
# if docs[i].get('domain','') not in ["cungcau.vn","baomoi.com","news.skydoor.net"]:
|
242 |
+
# response_doc = {}
|
243 |
+
# response_doc = docs[i]
|
244 |
+
# if 'domain' in docs[i]:
|
245 |
+
# response_doc['domain'] = docs[i]['domain']
|
246 |
+
# if 'url' in docs[i]:
|
247 |
+
# response_doc['url'] = docs[i]['url']
|
248 |
+
# if 'title' in docs[i]:
|
249 |
+
# response_doc['title'] = clean_text(docs[i]['title'])
|
250 |
+
# if 'snippet' in docs[i]:
|
251 |
+
# response_doc['snippet'] = clean_text(docs[i]['snippet'])
|
252 |
+
# if 'created_time' in docs[i]:
|
253 |
+
# response_doc['created_time'] = docs[i]['created_time']
|
254 |
+
# if 'message' in docs[i]:
|
255 |
+
# title = docs[i].get('title','')
|
256 |
+
# snippet = docs[i].get('snippet','')
|
257 |
+
# message = docs[i].get('message','')
|
258 |
+
# if title.strip():
|
259 |
+
# split_mess = message.split(title)
|
260 |
+
# if len(split_mess) > 1:
|
261 |
+
# message = title.join(split_mess[1:])
|
262 |
+
# if snippet.strip():
|
263 |
+
# split_mess = message.split(snippet)
|
264 |
+
# if len(split_mess) > 1:
|
265 |
+
# message = snippet.join(split_mess[1:])
|
266 |
+
|
267 |
+
# response_doc['message'] = clean_text(message)
|
268 |
+
# if 'id' in docs[i]:
|
269 |
+
# response_doc['id'] = docs[i]['id']
|
270 |
+
# response_doc['score'] = 0.0
|
271 |
+
# response_doc['title_summarize'] = []
|
272 |
+
# response_doc['content_summary'] = ""
|
273 |
+
# response_doc['total_facebook_viral'] = 0
|
274 |
+
# result[str(cluster_no + 1)].append(response_doc)
|
275 |
+
|
276 |
+
# empty_clus_ids = []
|
277 |
+
# for x in result:
|
278 |
+
# result[x] = sorted(result[x], key=lambda i: -len(i.get('message','')))
|
279 |
+
# if len( result[x]) > 0:
|
280 |
+
# if len(result[x]) > 1:
|
281 |
+
# result[x] = check_duplicate_title_domain(result[x])
|
282 |
+
# result[x][0]['num_docs'] = len(result[x])
|
283 |
+
# else:
|
284 |
+
# empty_clus_ids.append(x)
|
285 |
+
|
286 |
+
# for x in empty_clus_ids:
|
287 |
+
# result.pop(x,None)
|
288 |
+
# # result = dict(sorted(result.items(), key=lambda i: -len(i[1])))[:top_cluster]
|
289 |
+
# return post_processing(result, top_cluster=top_cluster, top_sentence=top_sentence, topn_summary=topn_summary, sorted_field = sorted_field)
|
290 |
+
|
291 |
+
def topic_clustering(docs, distance_threshold, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster=50, delete_message=True):
|
292 |
+
global model, model_en
|
293 |
+
docs = docs[:30000]
|
294 |
+
lang = get_lang(docs)
|
295 |
+
result = {}
|
296 |
+
cluster_score = {}
|
297 |
+
docs = segment(docs, lang=lang)
|
298 |
+
if len(docs) < 2:
|
299 |
+
return result
|
300 |
+
if lang == "vi":
|
301 |
+
features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
|
302 |
+
vectors = model.encode(features, show_progress_bar=False)
|
303 |
+
else:
|
304 |
+
features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
|
305 |
+
vectors = model_en.encode(features, show_progress_bar=False)
|
306 |
+
clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
|
307 |
+
linkage='single', distance_threshold=distance_threshold)
|
308 |
+
clusteror.fit(vectors)
|
309 |
+
print(clusteror.n_clusters_)
|
310 |
+
for i in range(clusteror.n_clusters_):
|
311 |
+
result[str(i + 1)] = []
|
312 |
+
cluster_score[str(i + 1)] = 0
|
313 |
+
for i in range(len(clusteror.labels_)):
|
314 |
+
cluster_no = clusteror.labels_[i]
|
315 |
+
if docs[i].get('domain','') not in ["cungcau.vn","baomoi.com","news.skydoor.net"]:
|
316 |
+
response_doc = {}
|
317 |
+
response_doc = docs[i]
|
318 |
+
score = response_doc.get('score', 0)
|
319 |
+
if not docs[i].get('message','').strip():
|
320 |
+
continue
|
321 |
+
if score > cluster_score[str(cluster_no + 1)]:
|
322 |
+
cluster_score[str(cluster_no + 1)] = score
|
323 |
+
if 'domain' in docs[i]:
|
324 |
+
response_doc['domain'] = docs[i]['domain']
|
325 |
+
if 'url' in docs[i]:
|
326 |
+
response_doc['url'] = docs[i]['url']
|
327 |
+
if 'title' in docs[i]:
|
328 |
+
response_doc['title'] = clean_text(docs[i]['title'])
|
329 |
+
if 'snippet' in docs[i]:
|
330 |
+
response_doc['snippet'] = clean_text(docs[i]['snippet'])
|
331 |
+
if 'created_time' in docs[i]:
|
332 |
+
response_doc['created_time'] = docs[i]['created_time']
|
333 |
+
if 'message' in docs[i]:
|
334 |
+
title = docs[i].get('title','')
|
335 |
+
snippet = docs[i].get('snippet','')
|
336 |
+
message = docs[i].get('message','')
|
337 |
+
if title.strip():
|
338 |
+
split_mess = message.split(title)
|
339 |
+
if len(split_mess) > 1:
|
340 |
+
message = title.join(split_mess[1:])
|
341 |
+
if snippet.strip():
|
342 |
+
split_mess = message.split(snippet)
|
343 |
+
if len(split_mess) > 1:
|
344 |
+
message = snippet.join(split_mess[1:])
|
345 |
+
|
346 |
+
response_doc['message'] = clean_text(message)
|
347 |
+
if 'id' in docs[i]:
|
348 |
+
response_doc['id'] = docs[i]['id']
|
349 |
+
# response_doc['score'] = 0.0
|
350 |
+
response_doc['title_summarize'] = []
|
351 |
+
response_doc['content_summary'] = ""
|
352 |
+
response_doc['total_facebook_viral'] = 0
|
353 |
+
result[str(cluster_no + 1)].append(response_doc)
|
354 |
+
|
355 |
+
empty_clus_ids = []
|
356 |
+
for x in result:
|
357 |
+
result[x] = sorted(result[x], key=lambda i: -len(i.get('message','')))
|
358 |
+
if len( result[x]) > 0:
|
359 |
+
if len(result[x]) > 1:
|
360 |
+
result[x] = check_duplicate_title_domain(result[x])
|
361 |
+
result[x][0]['num_docs'] = len(result[x])
|
362 |
+
result[x][0]['max_score'] = cluster_score[x]
|
363 |
+
else:
|
364 |
+
empty_clus_ids.append(x)
|
365 |
+
|
366 |
+
for x in empty_clus_ids:
|
367 |
+
result.pop(x,None)
|
368 |
+
# result = dict(sorted(result.items(), key=lambda i: -len(i[1])))[:top_cluster]
|
369 |
+
return post_processing(result, top_cluster=top_cluster, top_sentence=top_sentence, topn_summary=topn_summary, sorted_field = sorted_field, max_doc_per_cluster=max_doc_per_cluster, delete_message=delete_message)
|
370 |
+
|
371 |
+
def check_duplicate_title_domain(docs):
|
372 |
+
lst_title_domain = [f"{d.get('domain', '')} {d.get('title','')}" for d in docs]
|
373 |
+
for i in range(0, len(lst_title_domain) - 1):
|
374 |
+
for j in range(i+1,len(lst_title_domain)):
|
375 |
+
if lst_title_domain[j] == lst_title_domain[i]:
|
376 |
+
lst_title_domain[j] = 'dup'
|
377 |
+
lst_filter_docs = [docs[i] for i,x in enumerate(lst_title_domain) if x != 'dup']
|
378 |
+
return lst_filter_docs
|
379 |
+
def convert_date(text):
|
380 |
+
text = text.replace(".", "/")
|
381 |
+
text = text.replace("-", "/")
|
382 |
+
return text
|
383 |
+
|
384 |
+
|
385 |
+
def check_keyword(sentence):
|
386 |
+
keyword = ['sáng', 'trưa', 'chiều', 'tối', 'đến', 'hôm', 'ngày', 'tới']
|
387 |
+
for k in keyword:
|
388 |
+
if k in sentence:
|
389 |
+
return True
|
390 |
+
return False
|
391 |
+
|
392 |
+
|
393 |
+
def extract_events_and_time(docs, publish_date):
|
394 |
+
def standardize(date_str):
|
395 |
+
return date_str.replace('.', '/').replace('-', '/')
|
396 |
+
|
397 |
+
def add_0(date_str):
|
398 |
+
|
399 |
+
date_str = date_str.split('/')
|
400 |
+
res = []
|
401 |
+
for o in date_str:
|
402 |
+
o = re.sub('\s+', '', o)
|
403 |
+
if len(o) < 2:
|
404 |
+
o = '0' + o
|
405 |
+
res.append(o)
|
406 |
+
date_str = '/'.join(res)
|
407 |
+
return date_str
|
408 |
+
|
409 |
+
def get_date_list(reg, sentence):
|
410 |
+
find_object = re.finditer(reg, sentence)
|
411 |
+
date_list = [x.group() for x in find_object]
|
412 |
+
return date_list
|
413 |
+
|
414 |
+
year = publish_date.split('/')[2]
|
415 |
+
|
416 |
+
# dd/mm/yyyy
|
417 |
+
reg_exp_1 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
|
418 |
+
# #mm/yyyy
|
419 |
+
# reg_exp_5 = '(\D|^)(?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
|
420 |
+
# dd/mm
|
421 |
+
reg_exp_2 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])(\D|$)'
|
422 |
+
|
423 |
+
# ngày dd tháng mm năm yyyy
|
424 |
+
reg_exp_3 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}\s*(năm)\s*\d{4}'
|
425 |
+
# ngày dd tháng mm
|
426 |
+
reg_exp_4 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}'
|
427 |
+
|
428 |
+
result = []
|
429 |
+
for d in docs:
|
430 |
+
text = d['message']
|
431 |
+
for sentence in sent_tokenize(text):
|
432 |
+
lower_sentence = sentence.lower()
|
433 |
+
c = re.search(reg_exp_3, sentence.lower())
|
434 |
+
d = re.search(reg_exp_4, sentence.lower())
|
435 |
+
# e = re.search(reg_exp_5, sentence.lower())
|
436 |
+
a = re.search(reg_exp_1, sentence)
|
437 |
+
b = re.search(reg_exp_2, sentence)
|
438 |
+
#
|
439 |
+
if (a or b or c or d) and check_keyword(lower_sentence):
|
440 |
+
date_list = get_date_list(reg_exp_1, lower_sentence)
|
441 |
+
date_entity = ''
|
442 |
+
if date_list:
|
443 |
+
date_entity = add_0(standardize(date_list[0]))
|
444 |
+
elif get_date_list(reg_exp_2, lower_sentence):
|
445 |
+
date_list = get_date_list(reg_exp_2, lower_sentence)
|
446 |
+
date_entity = add_0(standardize(date_list[0]) + '/' + year)
|
447 |
+
elif get_date_list(reg_exp_3, lower_sentence):
|
448 |
+
date_list = get_date_list(reg_exp_3, lower_sentence)
|
449 |
+
|
450 |
+
date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
|
451 |
+
date_entity = re.sub('\s+', ' ', date_entity)
|
452 |
+
date_entity = date_entity.replace(' ', '/')
|
453 |
+
date_entity = add_0(date_entity)
|
454 |
+
else:
|
455 |
+
date_list = get_date_list(reg_exp_4, lower_sentence)
|
456 |
+
if date_list != []:
|
457 |
+
date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
|
458 |
+
date_entity = re.sub('\s+', ' ', date_entity)
|
459 |
+
date_entity = date_entity.replace(' ', '/')
|
460 |
+
date_entity = date_entity + '/' + year
|
461 |
+
date_entity = add_0(date_entity)
|
462 |
+
result.append((sentence, date_entity))
|
463 |
+
return result
|
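A short sketch of what the sorted_field switch above changes: when it is empty, post_processing ranks clusters by size; when it is any non-empty string, clusters are ranked by the max_score that topic_clustering records from the per-document score field. The records, scores and threshold below are invented for illustration.

from function.topic_clustering_not_summary import topic_clustering

docs = [
    {"id": "1", "lang": "vi", "score": 0.9, "title": "Tiêu đề A", "snippet": "Tóm tắt A.", "message": "Nội dung A..."},
    {"id": "2", "lang": "vi", "score": 0.1, "title": "Tiêu đề B", "snippet": "Tóm tắt B.", "message": "Nội dung B..."},
    {"id": "3", "lang": "vi", "score": 0.4, "title": "Tiêu đề C", "snippet": "Tóm tắt C.", "message": "Nội dung C..."},
]

# rank clusters by their best document score instead of by cluster size
clusters = topic_clustering(docs, distance_threshold=0.3, top_cluster=-1,
                            sorted_field="score", max_doc_per_cluster=10,
                            delete_message=False)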
function/topic_clustering_social.py
ADDED
@@ -0,0 +1,156 @@
1 |
+
import json
|
2 |
+
import time
|
3 |
+
from .utils import get_sbert_embedding, clean_text
|
4 |
+
from sklearn.cluster import AgglomerativeClustering
|
5 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
+
from nltk import sent_tokenize
|
7 |
+
import requests
|
8 |
+
# from clean_text import normalize_text
|
9 |
+
|
10 |
+
MAX_LENGTH_FEATURE = 250
|
11 |
+
MIN_LENGTH_FEATURE = 100
|
12 |
+
URL_CHECK_SPAM = "http://10.9.3.70:30036/predict"
|
13 |
+
|
14 |
+
def check_spam(docs):
|
15 |
+
json_body = {
|
16 |
+
"domain_id": "",
|
17 |
+
"records": [
|
18 |
+
{
|
19 |
+
"text": doc.get("message",""),
|
20 |
+
"idxcol": 1
|
21 |
+
} for doc in docs
|
22 |
+
]
|
23 |
+
}
|
24 |
+
|
25 |
+
result = requests.post(URL_CHECK_SPAM, json = json_body).json()
|
26 |
+
docs = [x for i,x in enumerate(docs) if result[i]["label"] == 0]
|
27 |
+
return docs
|
28 |
+
|
29 |
+
def preocess_feature(doc):
|
30 |
+
message = doc.get("message","")
|
31 |
+
paras = message.split("\n")
|
32 |
+
feature = ""
|
33 |
+
paras = [clean_text(x.strip(), normalize=False) for x in paras if x.strip() and len(x.strip()) > 10]
|
34 |
+
for para in paras:
|
35 |
+
if len(feature) + len(para) < MAX_LENGTH_FEATURE:
|
36 |
+
feature += " " +para
|
37 |
+
elif len(feature) < MIN_LENGTH_FEATURE:
|
38 |
+
sens = sent_tokenize(para)
|
39 |
+
for sen in sens:
|
40 |
+
if len(feature) + len(sen) < MAX_LENGTH_FEATURE or len(feature.strip()) < MIN_LENGTH_FEATURE:
|
41 |
+
feature += " " +sen
|
42 |
+
return feature
|
43 |
+
|
44 |
+
def topic_clustering(docs, distance_threshold, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster=50, delete_message=True, is_check_spam = True):
|
45 |
+
# global model, model_en
|
46 |
+
|
47 |
+
docs = [x for x in docs if len(x.get("message","")) > 100]
|
48 |
+
docs = docs[:30000]
|
49 |
+
if is_check_spam:
|
50 |
+
docs = check_spam(docs)
|
51 |
+
result = {}
|
52 |
+
cluster_score = {}
|
53 |
+
|
54 |
+
t1 = time.time()
|
55 |
+
if len(docs) < 1:
|
56 |
+
return result
|
57 |
+
elif len(docs) == 1:
|
58 |
+
return {
|
59 |
+
"0": docs
|
60 |
+
}
|
61 |
+
|
62 |
+
# features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
|
63 |
+
|
64 |
+
f_docs = []
|
65 |
+
for x in docs:
|
66 |
+
ft = preocess_feature(x)
|
67 |
+
if len(ft) > MIN_LENGTH_FEATURE:
|
68 |
+
x["title"] = ft
|
69 |
+
f_docs.append(x)
|
70 |
+
docs = f_docs
|
71 |
+
|
72 |
+
features = [x["title"] for x in docs ]
|
73 |
+
# with open("feature", 'w') as f:
|
74 |
+
# json.dump(features, f, ensure_ascii = False)
|
75 |
+
# print(features)
|
76 |
+
vectors = get_sbert_embedding(features)
|
77 |
+
|
78 |
+
clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
|
79 |
+
linkage='complete', distance_threshold=distance_threshold)
|
80 |
+
clusteror.fit(vectors)
|
81 |
+
print(f"Time encode + clustering: {time.time() - t1} {clusteror.n_clusters_}")
|
82 |
+
for i in range(clusteror.n_clusters_):
|
83 |
+
result[str(i + 1)] = []
|
84 |
+
cluster_score[str(i + 1)] = 0
|
85 |
+
for i in range(len(clusteror.labels_)):
|
86 |
+
cluster_no = clusteror.labels_[i]
|
87 |
+
if docs[i].get('domain','') not in ["cungcau.vn","baomoi.com","news.skydoor.net"]:
|
88 |
+
response_doc = {}
|
89 |
+
response_doc = docs[i]
|
90 |
+
score = response_doc.get('score', 0)
|
91 |
+
if not docs[i].get('message','').strip():
|
92 |
+
continue
|
93 |
+
if score > cluster_score[str(cluster_no + 1)]:
|
94 |
+
cluster_score[str(cluster_no + 1)] = score
|
95 |
+
if 'domain' in docs[i]:
|
96 |
+
response_doc['domain'] = docs[i]['domain']
|
97 |
+
if 'url' in docs[i]:
|
98 |
+
response_doc['url'] = docs[i]['url']
|
99 |
+
if 'title' in docs[i]:
|
100 |
+
response_doc['title'] = clean_text(docs[i]['title'])
|
101 |
+
if 'snippet' in docs[i]:
|
102 |
+
response_doc['snippet'] = clean_text(docs[i]['snippet'])
|
103 |
+
if 'created_time' in docs[i]:
|
104 |
+
response_doc['created_time'] = docs[i]['created_time']
|
105 |
+
if "sentiment" in docs[i]:
|
106 |
+
response_doc['sentiment'] = docs[i]['sentiment']
|
107 |
+
if 'message' in docs[i]:
|
108 |
+
title = docs[i].get('title','')
|
109 |
+
snippet = docs[i].get('snippet','')
|
110 |
+
message = docs[i].get('message','')
|
111 |
+
# if title.strip():
|
112 |
+
# split_mess = message.split(title)
|
113 |
+
# if len(split_mess) > 1:
|
114 |
+
# message = title.join(split_mess[1:])
|
115 |
+
# if snippet.strip():
|
116 |
+
# split_mess = message.split(snippet)
|
117 |
+
# if len(split_mess) > 1:
|
118 |
+
# message = snippet.join(split_mess[1:])
|
119 |
+
|
120 |
+
response_doc['message'] = clean_text(message)
|
121 |
+
if 'id' in docs[i]:
|
122 |
+
response_doc['id'] = docs[i]['id']
|
123 |
+
# response_doc['score'] = 0.0
|
124 |
+
|
125 |
+
# response_doc['title_summarize'] = []
|
126 |
+
# response_doc['content_summary'] = ""
|
127 |
+
# response_doc['total_facebook_viral'] = 0
|
128 |
+
result[str(cluster_no + 1)].append(response_doc)
|
129 |
+
|
130 |
+
empty_clus_ids = []
|
131 |
+
for x in result:
|
132 |
+
result[x] = sorted(result[x], key=lambda i: -len(i.get('message','')))
|
133 |
+
if len( result[x]) > 0:
|
134 |
+
# if len(result[x]) > 1:
|
135 |
+
# result[x] = check_duplicate_title_domain(result[x])
|
136 |
+
result[x][0]['num_docs'] = len(result[x])
|
137 |
+
result[x][0]['max_score'] = cluster_score[x]
|
138 |
+
else:
|
139 |
+
empty_clus_ids.append(x)
|
140 |
+
|
141 |
+
for x in empty_clus_ids:
|
142 |
+
result.pop(x,None)
|
143 |
+
|
144 |
+
result = dict( sorted(result.items(), key=lambda i: -len(i[1]))[:top_cluster])
|
145 |
+
return result
|
146 |
+
# return post_processing(result, top_cluster=top_cluster, top_sentence=top_sentence, topn_summary=topn_summary, sorted_field = sorted_field, max_doc_per_cluster=max_doc_per_cluster, delete_message=delete_message)
|
147 |
+
|
148 |
+
if __name__ == '__main__':
|
149 |
+
# with open("/home2/vietle/DA-Report/social.json", 'r') as f:
|
150 |
+
# docs = json.load(f)[:2000]
|
151 |
+
with open("/home2/vietle/news-cms/topic_summarization/data/news_cms.social.json", 'r') as f:
|
152 |
+
docs = json.load(f)[:10000]
|
153 |
+
clusters = topic_clustering(docs, distance_threshold=0.2, top_cluster=5000, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster=50, delete_message=False)
|
154 |
+
with open("/home2/vietle/news-cms/topic_summarization/cluster/news_cms.social.json", 'w') as f:
|
155 |
+
|
156 |
+
json.dump(clusters,f, ensure_ascii =False)
|
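Unlike the news variants, the social clustering above does not embed title + snippet; it embeds the string built by preocess_feature, which concatenates the first cleaned paragraphs of the post (and, if the result is still shorter than MIN_LENGTH_FEATURE, individual sentences) until it reaches roughly MAX_LENGTH_FEATURE characters. A self-contained sketch with an invented post:

from function.topic_clustering_social import preocess_feature

post = {
    "message": "Đoạn mở đầu của bài viết trên mạng xã hội.\n"
               "Đoạn thứ hai bổ sung thêm chi tiết cho sự kiện được nhắc đến.\n"
               "Đoạn thứ ba tiếp tục mô tả diễn biến của sự kiện."
}
feature = preocess_feature(post)
print(len(feature), feature)   # stays around MAX_LENGTH_FEATURE (250) characters at most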
function/topic_clustering_v2.py
ADDED
@@ -0,0 +1,390 @@
1 |
+
from tensorRT import inference
|
2 |
+
import re
|
3 |
+
from vncorenlp import VnCoreNLP
|
4 |
+
from nltk.tokenize import sent_tokenize
|
5 |
+
import torch
|
6 |
+
import datetime
|
7 |
+
from sklearn.cluster import AgglomerativeClustering
|
8 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
9 |
+
import numpy as np
|
10 |
+
import json
|
11 |
+
from . import utils
|
12 |
+
import time
|
13 |
+
from summary import text_summary, get_summary_bert
|
14 |
+
from function.clean_text import normalize_text
|
15 |
+
|
16 |
+
# from . import detect_time as dt
|
17 |
+
|
18 |
+
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
19 |
+
use_cuda = torch.cuda.is_available()
|
20 |
+
print(torch.cuda.is_available())
|
21 |
+
|
22 |
+
# annotator = VnCoreNLP('vncorenlp/VnCoreNLP-1.1.1.jar', port=9191, annotators="wseg,pos", max_heap_size='-Xmx2g')
|
23 |
+
|
24 |
+
|
25 |
+
def detect_postaging(text_in):
|
26 |
+
word_segmented_text = annotator.annotate(text_in)
|
27 |
+
lst_k = []
|
28 |
+
for se in word_segmented_text["sentences"]:
|
29 |
+
for kw in se:
|
30 |
+
if kw["posTag"] in ("Np", "Ny", "N"):
|
31 |
+
if kw["posTag"] == "N" and "_" not in kw["form"]:
|
32 |
+
continue
|
33 |
+
lst_k.append(kw["form"].replace("_", " "))
|
34 |
+
return list(set(lst_k))
|
35 |
+
|
36 |
+
def clean_text(text_in):
|
37 |
+
doc = re.sub('<.*?>', '', text_in)
|
38 |
+
doc = re.sub('(function).*}', ' ', doc)
|
39 |
+
# link
|
40 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
|
41 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
|
42 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
|
43 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
|
44 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
|
45 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
|
46 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
|
47 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
|
48 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
|
49 |
+
|
50 |
+
doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
|
51 |
+
doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
|
52 |
+
doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
|
53 |
+
doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
|
54 |
+
doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
|
55 |
+
doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
|
56 |
+
doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
|
57 |
+
doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
|
58 |
+
doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
|
59 |
+
# escape sequence
|
60 |
+
doc = re.sub('\n', ' ', doc)
|
61 |
+
doc = re.sub('\t', ' ', doc)
|
62 |
+
doc = re.sub('\r', ' ', doc)
|
63 |
+
|
64 |
+
doc = normalize_text(doc)
|
65 |
+
return doc
|
66 |
+
|
67 |
+
|
68 |
+
def data_cleaning(docs):
|
69 |
+
res = []
|
70 |
+
for d in docs:
|
71 |
+
if 'message' in d:
|
72 |
+
# css and js
|
73 |
+
doc = re.sub('<.*?>', '', d['message'])
|
74 |
+
doc = re.sub('(function).*}', ' ', doc)
|
75 |
+
|
76 |
+
# link
|
77 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
|
78 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
|
79 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
|
80 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
|
81 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
|
82 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
|
83 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
|
84 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
|
85 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
|
86 |
+
|
87 |
+
doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
|
88 |
+
doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
|
89 |
+
doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
|
90 |
+
doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
|
91 |
+
doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
|
92 |
+
doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
|
93 |
+
doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
|
94 |
+
doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
|
95 |
+
doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
|
96 |
+
# escape sequence
|
97 |
+
doc = re.sub('\n', ' ', doc)
|
98 |
+
doc = re.sub('\t', ' ', doc)
|
99 |
+
doc = re.sub('\r', ' ', doc)
|
100 |
+
|
101 |
+
d['message'] = doc
|
102 |
+
res.append(d)
|
103 |
+
return res
|
104 |
+
|
105 |
+
|
106 |
+
def segment(docs, lang="vi"):
|
107 |
+
segmented_docs = []
|
108 |
+
for d in docs:
|
109 |
+
print(d)
|
110 |
+
# if len(d.get('message', "")) > 8000 or len(d.get('message', "")) < 100:
|
111 |
+
if len(d.get('message', "")) > 8000:
|
112 |
+
continue
|
113 |
+
if 'snippet' not in d:
|
114 |
+
continue
|
115 |
+
try:
|
116 |
+
if lang == "vi":
|
117 |
+
snippet = d.get('snippet', "")
|
118 |
+
segmented_snippet = ""
|
119 |
+
segmented_sentences_snippet = annotator.tokenize(snippet)
|
120 |
+
for sentence in segmented_sentences_snippet:
|
121 |
+
segmented_snippet += ' ' + ' '.join(sentence)
|
122 |
+
segmented_snippet = segmented_snippet.replace('\xa0', '')
|
123 |
+
d['segmented_snippet'] = segmented_snippet
|
124 |
+
segmented_docs.append(d)
|
125 |
+
except Exception:
|
126 |
+
pass
|
127 |
+
return segmented_docs
|
128 |
+
|
129 |
+
|
130 |
+
def timestamp_to_date(timestamp):
|
131 |
+
return datetime.datetime.fromtimestamp(timestamp).strftime('%d/%m/%Y')
|
132 |
+
|
133 |
+
|
134 |
+
def post_processing(response, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster = 50, delete_message=True):
|
135 |
+
print(f'[INFO] sorted_field: {sorted_field}')
|
136 |
+
MAX_DOC_PER_CLUSTER = max_doc_per_cluster
|
137 |
+
|
138 |
+
lst_ids = []
|
139 |
+
lst_top = []
|
140 |
+
lst_res = []
|
141 |
+
for i in response:
|
142 |
+
lst_ids.append(i)
|
143 |
+
|
144 |
+
if not sorted_field.strip():
|
145 |
+
lst_top.append(len(response[i]))
|
146 |
+
else:
|
147 |
+
lst_top.append(response[i][0]['max_score'])
|
148 |
+
|
149 |
+
idx = np.argsort(np.array(lst_top))[::-1]
|
150 |
+
if top_cluster == -1:
|
151 |
+
top_cluster = len(idx)
|
152 |
+
for i in idx[: top_cluster]:
|
153 |
+
ik = lst_ids[i]
|
154 |
+
if top_sentence == -1:
|
155 |
+
top_sentence = len(response[ik])
|
156 |
+
lst_check_title = []
|
157 |
+
lst_check_not_title = []
|
158 |
+
i_c_t = 0
|
159 |
+
for resss in response[ik]:
|
160 |
+
r_title = resss.get("title", "")
|
161 |
+
if r_title and not r_title.endswith("..."):
|
162 |
+
lst_check_title.append(resss)
|
163 |
+
i_c_t += 1
|
164 |
+
else:
|
165 |
+
lst_check_not_title.append(resss)
|
166 |
+
if i_c_t == top_sentence:
|
167 |
+
break
|
168 |
+
if i_c_t == top_sentence:
|
169 |
+
lst_res.append(lst_check_title)
|
170 |
+
else:
|
171 |
+
lst_check_title.extend(lst_check_not_title)
|
172 |
+
lst_res.append(lst_check_title[:top_sentence])
|
173 |
+
#lst_res.append(response[ik][:top_sentence])
|
174 |
+
dict_res = {}
|
175 |
+
for i in range(len(lst_res)):
|
176 |
+
dict_res[str(i + 1)] = lst_res[i][:MAX_DOC_PER_CLUSTER]
|
177 |
+
for j in range(min(len(dict_res[str(i + 1)]), 3)):
|
178 |
+
dict_res[str(i + 1)][0]["title_summarize"].append(dict_res[str(i + 1)][j].get("snippet", ""))
|
179 |
+
summary_text = get_summary_bert(dict_res[str(i + 1)][0].get("message", ""), dict_res[str(i + 1)][0].get("lang", "vi"), topn=topn_summary, title=dict_res[str(i + 1)][0].get("title", ""), snippet=dict_res[str(i + 1)][0].get("snippet", ""))
|
180 |
+
if len(summary_text) < 10:
|
181 |
+
summary_text = dict_res[str(i + 1)][0].get("snippet", "")
|
182 |
+
if len(summary_text) < 10:
|
183 |
+
summary_text = dict_res[str(i + 1)][0].get("title", "")
|
184 |
+
dict_res[str(i + 1)][0]["content_summary"] = utils.remove_image_keyword(summary_text)
|
185 |
+
kew_phares = []
|
186 |
+
dict_res[str(i + 1)][0]["topic_keywords"] = kew_phares
|
187 |
+
|
188 |
+
print("delete_message: ", delete_message)
|
189 |
+
if delete_message:
|
190 |
+
for j in range(len(dict_res[str(i + 1)])):
|
191 |
+
if "message" in dict_res[str(i + 1)][j]:
|
192 |
+
del dict_res[str(i + 1)][j]["message"]
|
193 |
+
return dict_res
|
194 |
+
|
195 |
+
|
196 |
+
def get_lang(docs):
|
197 |
+
lang_vi = 0
|
198 |
+
lang_en = 0
|
199 |
+
for d in docs:
|
200 |
+
if d.get("lang", "") == "vi":
|
201 |
+
lang_vi += 1
|
202 |
+
else:
|
203 |
+
lang_en += 1
|
204 |
+
if lang_en >= lang_vi:
|
205 |
+
return "en"
|
206 |
+
return "vi"
|
207 |
+
|
208 |
+
def topic_clustering(docs, distance_threshold, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster=50, delete_message=True):
|
209 |
+
# global model, model_en
|
210 |
+
docs = docs[:30000]
|
211 |
+
lang = get_lang(docs)
|
212 |
+
result = {}
|
213 |
+
cluster_score = {}
|
214 |
+
# docs = segment(docs, lang=lang)
|
215 |
+
|
216 |
+
t1 = time.time()
|
217 |
+
if len(docs) < 1:
|
218 |
+
return result
|
219 |
+
elif len(docs) == 1:
|
220 |
+
return {
|
221 |
+
"0": docs
|
222 |
+
}
|
223 |
+
if lang == "vi":
|
224 |
+
features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
|
225 |
+
vectors = inference.encode(features, lang=lang)
|
226 |
+
# vectors = model.encode(features, show_progress_bar=False)
|
227 |
+
else:
|
228 |
+
features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
|
229 |
+
vectors = inference.encode(features, lang=lang)
|
230 |
+
# vectors = model_en.encode(features, show_progress_bar=False)
|
231 |
+
clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
|
232 |
+
linkage='single', distance_threshold=distance_threshold)
|
233 |
+
clusteror.fit(vectors)
|
234 |
+
print(f"Time encode + clustering: {time.time() - t1} {clusteror.n_clusters_}")
|
235 |
+
for i in range(clusteror.n_clusters_):
|
236 |
+
result[str(i + 1)] = []
|
237 |
+
cluster_score[str(i + 1)] = 0
|
238 |
+
for i in range(len(clusteror.labels_)):
|
239 |
+
cluster_no = clusteror.labels_[i]
|
240 |
+
if docs[i].get('domain','') not in ["cungcau.vn","baomoi.com","news.skydoor.net"]:
|
241 |
+
response_doc = {}
|
242 |
+
response_doc = docs[i]
|
243 |
+
score = response_doc.get('score', 0)
|
244 |
+
if not docs[i].get('message','').strip():
|
245 |
+
continue
|
246 |
+
if score > cluster_score[str(cluster_no + 1)]:
|
247 |
+
cluster_score[str(cluster_no + 1)] = score
|
248 |
+
if 'domain' in docs[i]:
|
249 |
+
response_doc['domain'] = docs[i]['domain']
|
250 |
+
if 'url' in docs[i]:
|
251 |
+
response_doc['url'] = docs[i]['url']
|
252 |
+
if 'title' in docs[i]:
|
253 |
+
response_doc['title'] = clean_text(docs[i]['title'])
|
254 |
+
if 'snippet' in docs[i]:
|
255 |
+
response_doc['snippet'] = clean_text(docs[i]['snippet'])
|
256 |
+
if 'created_time' in docs[i]:
|
257 |
+
response_doc['created_time'] = docs[i]['created_time']
|
258 |
+
if "sentiment" in docs[i]:
|
259 |
+
response_doc['sentiment'] = docs[i]['sentiment']
|
260 |
+
if 'message' in docs[i]:
|
261 |
+
title = docs[i].get('title','')
|
262 |
+
snippet = docs[i].get('snippet','')
|
263 |
+
message = docs[i].get('message','')
|
264 |
+
if title.strip():
|
265 |
+
split_mess = message.split(title)
|
266 |
+
if len(split_mess) > 1:
|
267 |
+
message = title.join(split_mess[1:])
|
268 |
+
if snippet.strip():
|
269 |
+
split_mess = message.split(snippet)
|
270 |
+
if len(split_mess) > 1:
|
271 |
+
message = snippet.join(split_mess[1:])
|
272 |
+
|
273 |
+
response_doc['message'] = clean_text(message)
|
274 |
+
if 'id' in docs[i]:
|
275 |
+
response_doc['id'] = docs[i]['id']
|
276 |
+
# response_doc['score'] = 0.0
|
277 |
+
response_doc['title_summarize'] = []
|
278 |
+
response_doc['content_summary'] = ""
|
279 |
+
response_doc['total_facebook_viral'] = 0
|
280 |
+
result[str(cluster_no + 1)].append(response_doc)
|
281 |
+
|
282 |
+
empty_clus_ids = []
|
283 |
+
for x in result:
|
284 |
+
result[x] = sorted(result[x], key=lambda i: -len(i.get('message','')))
|
285 |
+
if len( result[x]) > 0:
|
286 |
+
if len(result[x]) > 1:
|
287 |
+
result[x] = check_duplicate_title_domain(result[x])
|
288 |
+
result[x][0]['num_docs'] = len(result[x])
|
289 |
+
result[x][0]['max_score'] = cluster_score[x]
|
290 |
+
else:
|
291 |
+
empty_clus_ids.append(x)
|
292 |
+
|
293 |
+
for x in empty_clus_ids:
|
294 |
+
result.pop(x,None)
|
295 |
+
# result = dict(sorted(result.items(), key=lambda i: -len(i[1])))[:top_cluster]
|
296 |
+
return post_processing(result, top_cluster=top_cluster, top_sentence=top_sentence, topn_summary=topn_summary, sorted_field = sorted_field, max_doc_per_cluster=max_doc_per_cluster, delete_message=delete_message)
|
297 |
+
|
298 |
+
def check_duplicate_title_domain(docs):
|
299 |
+
lst_title_domain = [f"{d.get('domain', '')} {d.get('title','')}" for d in docs]
|
300 |
+
for i in range(0, len(lst_title_domain) - 1):
|
301 |
+
for j in range(i+1,len(lst_title_domain)):
|
302 |
+
if lst_title_domain[j] == lst_title_domain[i]:
|
303 |
+
lst_title_domain[j] = 'dup'
|
304 |
+
lst_filter_docs = [docs[i] for i,x in enumerate(lst_title_domain) if x != 'dup']
|
305 |
+
return lst_filter_docs
|
306 |
+
def convert_date(text):
|
307 |
+
text = text.replace(".", "/")
|
308 |
+
text = text.replace("-", "/")
|
309 |
+
return text
|
310 |
+
|
311 |
+
|
312 |
+
def check_keyword(sentence):
|
313 |
+
keyword = ['sáng', 'trưa', 'chiều', 'tối', 'đến', 'hôm', 'ngày', 'tới']
|
314 |
+
for k in keyword:
|
315 |
+
if k in sentence:
|
316 |
+
return True
|
317 |
+
return False
|
318 |
+
|
319 |
+
|
320 |
+
def extract_events_and_time(docs, publish_date):
|
321 |
+
def standardize(date_str):
|
322 |
+
return date_str.replace('.', '/').replace('-', '/')
|
323 |
+
|
324 |
+
def add_0(date_str):
|
325 |
+
|
326 |
+
date_str = date_str.split('/')
|
327 |
+
res = []
|
328 |
+
for o in date_str:
|
329 |
+
o = re.sub('\s+', '', o)
|
330 |
+
if len(o) < 2:
|
331 |
+
o = '0' + o
|
332 |
+
res.append(o)
|
333 |
+
date_str = '/'.join(res)
|
334 |
+
return date_str
|
335 |
+
|
336 |
+
def get_date_list(reg, sentence):
|
337 |
+
find_object = re.finditer(reg, sentence)
|
338 |
+
date_list = [x.group() for x in find_object]
|
339 |
+
return date_list
|
340 |
+
|
341 |
+
year = publish_date.split('/')[2]
|
342 |
+
|
343 |
+
# dd/mm/yyyy
|
344 |
+
reg_exp_1 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
|
345 |
+
# #mm/yyyy
|
346 |
+
# reg_exp_5 = '(\D|^)(?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
|
347 |
+
# dd/mm
|
348 |
+
reg_exp_2 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])(\D|$)'
|
349 |
+
|
350 |
+
# ngày dd tháng mm năm yyyy
|
351 |
+
reg_exp_3 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}\s*(năm)\s*\d{4}'
|
352 |
+
# ngày dd tháng mm
|
353 |
+
reg_exp_4 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}'
|
354 |
+
|
355 |
+
result = []
|
356 |
+
for d in docs:
|
357 |
+
text = d['message']
|
358 |
+
for sentence in sent_tokenize(text):
|
359 |
+
lower_sentence = sentence.lower()
|
360 |
+
c = re.search(reg_exp_3, sentence.lower())
|
361 |
+
d = re.search(reg_exp_4, sentence.lower())
|
362 |
+
# e = re.search(reg_exp_5, sentence.lower())
|
363 |
+
a = re.search(reg_exp_1, sentence)
|
364 |
+
b = re.search(reg_exp_2, sentence)
|
365 |
+
#
|
366 |
+
if (a or b or c or d) and check_keyword(lower_sentence):
|
367 |
+
date_list = get_date_list(reg_exp_1, lower_sentence)
|
368 |
+
date_entity = ''
|
369 |
+
if date_list:
|
370 |
+
date_entity = add_0(standardize(date_list[0]))
|
371 |
+
elif get_date_list(reg_exp_2, lower_sentence):
|
372 |
+
date_list = get_date_list(reg_exp_2, lower_sentence)
|
373 |
+
date_entity = add_0(standardize(date_list[0]) + '/' + year)
|
374 |
+
elif get_date_list(reg_exp_3, lower_sentence):
|
375 |
+
date_list = get_date_list(reg_exp_3, lower_sentence)
|
376 |
+
|
377 |
+
date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
|
378 |
+
date_entity = re.sub('\s+', ' ', date_entity)
|
379 |
+
date_entity = date_entity.replace(' ', '/')
|
380 |
+
date_entity = add_0(date_entity)
|
381 |
+
else:
|
382 |
+
date_list = get_date_list(reg_exp_4, lower_sentence)
|
383 |
+
if date_list != []:
|
384 |
+
date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
|
385 |
+
date_entity = re.sub('\s+', ' ', date_entity)
|
386 |
+
date_entity = date_entity.replace(' ', '/')
|
387 |
+
date_entity = date_entity + '/' + year
|
388 |
+
date_entity = add_0(date_entity)
|
389 |
+
result.append((sentence, date_entity))
|
390 |
+
return result
|
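All of the clustering variants in this folder delegate the actual grouping to scikit-learn's AgglomerativeClustering, driven by cosine distances and a distance_threshold instead of a fixed number of clusters. A self-contained sketch of that behaviour on toy vectors (recent scikit-learn releases rename affinity to metric, so the affinity spelling used above assumes an older version):

import numpy as np
from sklearn.cluster import AgglomerativeClustering

vectors = np.array([
    [1.00, 0.00],   # doc 0
    [0.99, 0.05],   # doc 1: almost the same direction as doc 0
    [0.00, 1.00],   # doc 2: orthogonal, cosine distance 1.0 from doc 0
])

clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True,
                                    affinity='cosine', linkage='single',
                                    distance_threshold=0.2)
clusteror.fit(vectors)
print(clusteror.n_clusters_)   # 2: docs 0 and 1 end up together, doc 2 alone
print(clusteror.labels_)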
function/translate.py
ADDED
@@ -0,0 +1,37 @@
1 |
+
from langdetect import detect
|
2 |
+
import requests
|
3 |
+
import json
|
4 |
+
import time
|
5 |
+
|
6 |
+
URL_TRANSLATOR = "http://10.9.3.241:8093/translator"
|
7 |
+
def detect_lang(text):
|
8 |
+
try:
|
9 |
+
lang = detect(text)
|
10 |
+
except:
|
11 |
+
lang = 'en'
|
12 |
+
return lang
|
13 |
+
|
14 |
+
def translate_text_multi_layer(source, target, text, url = URL_TRANSLATOR):
|
15 |
+
if source == "":
|
16 |
+
source = detect_lang(text)
|
17 |
+
print("PPPPPPPPPPPPP")
|
18 |
+
if not text.strip() or source == target:
|
19 |
+
return text
|
20 |
+
|
21 |
+
json_body = {
|
22 |
+
"doc": text,
|
23 |
+
"src_lang": source,
|
24 |
+
"tgt_lang": target
|
25 |
+
}
|
26 |
+
print("CCCCCCCCCCCC")
|
27 |
+
res= requests.post(url, json=json_body)
|
28 |
+
print("translate: ", res.status_code)
|
29 |
+
path_log = f"log_tran/requests_tran_{time.time()}.txt"
|
30 |
+
with open(path_log, "w", encoding="utf-8") as f:
|
31 |
+
f.write(json.dumps(json_body) + "\n")
|
32 |
+
if res.status_code == 200:
|
33 |
+
res = res.json()
|
34 |
+
with open(path_log, "a", encoding="utf-8") as f:
|
35 |
+
f.write(json.dumps(res) + "\n")
|
36 |
+
return res
|
37 |
+
return ''
|
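translate_text_multi_layer is a thin client for an internal translation service, so the call below only works inside that network; it is a sketch of the intended call shape rather than a runnable test. Note that the helper writes a request/response log under log_tran/, so that directory must exist, and detect_lang falls back to 'en' whenever langdetect cannot decide.

from function.translate import detect_lang, translate_text_multi_layer

text = "Xin chào thế giới"
print(detect_lang(text))                      # expected: 'vi'

# passing source="" lets the helper detect the source language itself
translated = translate_text_multi_layer("", "en", text)
print(translated)                             # parsed JSON response of the service, or '' on failure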
function/utils.py
ADDED
@@ -0,0 +1,94 @@
1 |
+
import editdistance
|
2 |
+
import requests
|
3 |
+
import numpy as np
|
4 |
+
import re
|
5 |
+
from .clean_text import normalize_text
|
6 |
+
URL_SBERT = "http://10.9.3.240:6789/sbert/encode_list"
|
7 |
+
# app_config.parse_url_api('api_sbert')
|
8 |
+
|
9 |
+
def get_sbert_embedding(lst_sentence, url = URL_SBERT):
|
10 |
+
input_data = {
|
11 |
+
"sentences": lst_sentence
|
12 |
+
}
|
13 |
+
embs = requests.post(url, json=input_data).json()
|
14 |
+
embs = np.array(embs)
|
15 |
+
|
16 |
+
return embs
|
17 |
+
|
18 |
+
def is_number(word):
|
19 |
+
lst_end = ['$', '%', 'vnđ', '.', ',']
|
20 |
+
word_lo = word.lower()
|
21 |
+
for k in lst_end:
|
22 |
+
word_lo = word_lo.replace(k, '')
|
23 |
+
|
24 |
+
if word_lo.isdigit():
|
25 |
+
return True
|
26 |
+
else:
|
27 |
+
return False
|
28 |
+
|
29 |
+
|
30 |
+
def get_number(text):
|
31 |
+
dt = text.split(' ')
|
32 |
+
for w in dt:
|
33 |
+
if is_number(w):
|
34 |
+
return w
|
35 |
+
|
36 |
+
return ''
|
37 |
+
|
38 |
+
|
39 |
+
def check_editdistance(ww1, ww2):
|
40 |
+
if len(ww1) == 0 or len(ww1) == 0:
|
41 |
+
return 0
|
42 |
+
else:
|
43 |
+
n_c = editdistance.eval(ww1.lower(), ww2.lower())
|
44 |
+
score = n_c / max(len(ww1), len(ww2))
|
45 |
+
return 1 - score
|
46 |
+
|
47 |
+
|
48 |
+
def remove_image_keyword(text_input):
|
49 |
+
lst_key = ["ảnh:", "ảnh :", "Ảnh:", "Ảnh :",
|
50 |
+
"Ảnh minh họa:", "Ảnh minh họa :", "ảnh minh họa:", "ảnh minh họa :",
|
51 |
+
"Nguồn:", "nguồn:", "Nguồn :", "nguồn :",
|
52 |
+
"Source:", "Source :", "source:", "source :",
|
53 |
+
"Src:", "Src :", "src:", "src :",
|
54 |
+
"Image:", "Image :", "img:", "img :",
|
55 |
+
"image:", "image :", "Img:", "Img :",
|
56 |
+
"xem tiếp", "xem thêm", "Xem tiếp", "Xem thêm"]
|
57 |
+
for k in lst_key:
|
58 |
+
text_input = text_input.replace(k, " ")
|
59 |
+
return text_input.strip()
|
60 |
+
|
61 |
+
def clean_text(text_in, normalize=True):
|
62 |
+
doc = re.sub('<.*?>', '', text_in)
|
63 |
+
doc = re.sub('(function).*}', ' ', doc)
|
64 |
+
# link
|
65 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
|
66 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
|
67 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
|
68 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
|
69 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
|
70 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
|
71 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
|
72 |
+
doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
|
73 |
+
doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
|
74 |
+
|
75 |
+
doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
|
76 |
+
doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
|
77 |
+
doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
|
78 |
+
doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
|
79 |
+
doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
|
80 |
+
doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
|
81 |
+
doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
|
82 |
+
doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
|
83 |
+
doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
|
84 |
+
# escape sequence
|
85 |
+
doc = re.sub('\n', ' ', doc)
|
86 |
+
doc = re.sub('\t', ' ', doc)
|
87 |
+
doc = re.sub('\r', ' ', doc)
|
88 |
+
|
89 |
+
if normalize:
|
90 |
+
doc = normalize_text(doc)
|
91 |
+
return doc
|
92 |
+
|
93 |
+
if __name__ == '__main__':
|
94 |
+
print(check_editdistance('tttt', 'tt'))
|
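A short sketch of the helpers above in use. get_sbert_embedding needs the internal SBERT service at URL_SBERT, so only the offline helpers are exercised here; it also assumes the function package's own imports resolve in your environment.

# Offline helpers from function/utils.py; no external service needed.
from function.utils import check_editdistance, get_number, remove_image_keyword

print(check_editdistance("Hà Nội", "Ha Noi"))   # fuzzy similarity in [0, 1]
print(get_number("giá 15000 vnđ một cốc"))      # -> '15000'
print(remove_image_keyword("Toàn cảnh hội nghị. Ảnh: TTXVN"))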
get_config.py
ADDED
@@ -0,0 +1,5 @@
from yaml import load, Loader

config_params = {}
with open('config/config.yml', encoding='utf-8') as f:
    config_params.update(load(f, Loader=Loader))
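Once get_config.py has run, other modules read settings from the config_params dict. The key below is purely illustrative ('api_sbert' appears only in a comment in function/utils.py); the actual keys live in config/config.yml.

# Illustrative only: 'api_sbert' is a hypothetical key, not confirmed by this diff.
from get_config import config_params

sbert_url = config_params.get('api_sbert', 'http://localhost:6789/sbert/encode_list')
print(sbert_url)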
iclibs/ic_rabbit.py
ADDED
@@ -0,0 +1,126 @@
# -*- coding: utf-8 -*-
import pika
import json


class ICRabbitMQ(object):
    def __init__(self, host, virtual_host, usr, passw, **kwargs):
        """
        Initialize connection settings.
        :param host: RabbitMQ server address
        :param virtual_host: virtual host
        :param usr: RabbitMQ user
        :param passw: password
        """
        self.host = host
        self.virtual_host = virtual_host
        self.user = usr
        self.passw = passw
        self.credentials = pika.PlainCredentials(usr, passw)
        self.connection = None
        self.kwargs = kwargs

    def init_connection(self):
        self.connection = \
            pika.BlockingConnection(
                pika.ConnectionParameters(host=self.host, virtual_host=self.virtual_host, credentials=self.credentials))

    def connection_close(self):
        self.connection.close()

    def connection_status(self):
        return self.connection.is_open

    def init_queue(self, queue_name, exchange="", exchange_type='fanout', durable=True, max_priority=-1):
        """
        Declare a queue (or an exchange) and return the channel.
        :param exchange: exchange name; if empty, a plain queue is declared
        :param queue_name: queue name
        :param durable: True (the queue survives a RabbitMQ restart)
        :param max_priority: maximum priority level; -1 disables priorities
                             (RabbitMQ supports priorities up to 10)
        :return: channel
        """
        if self.connection is None:
            self.init_connection()
        channel = self.connection.channel()
        if exchange == "" and queue_name != "":
            if max_priority == -1:
                channel.queue_declare(queue=queue_name, durable=durable)
            else:
                channel.queue_declare(queue=queue_name, durable=durable, arguments={'x-max-priority': max_priority})
        else:
            channel.exchange_declare(exchange=exchange, exchange_type=exchange_type, durable=durable)
        return channel

    @staticmethod
    def publish_message(channel, routing_key, body, priority=-1, delivery_mode=2, exchange=''):
        """
        Publish a message.
        :param channel: an initialized channel
        :param routing_key: routing key, or queue name when exchange is ''
        :param body: payload to publish (JSON-serializable)
        :param priority: message priority; -1 publishes without a priority
        :param delivery_mode: 2 = persistent message
        :param exchange: exchange to route through
        """
        if priority == -1:
            channel.basic_publish(exchange=exchange, routing_key=routing_key, body=json.dumps(body),
                                  properties=pika.BasicProperties(delivery_mode=delivery_mode))
        else:
            channel.basic_publish(exchange=exchange, routing_key=routing_key, body=json.dumps(body),
                                  properties=pika.BasicProperties(delivery_mode=delivery_mode, priority=priority))
        print("push done: ")

    @staticmethod
    def run_consummer(channel, queue_name, callback_func, is_ack=False):
        """
        Run a consumer.
        :param channel: an initialized channel
        :param queue_name: queue name
        :param callback_func: user-defined callback invoked for every message
        :param is_ack: if True, acknowledge before running the callback;
                       otherwise acknowledge after the callback returns
        :return:
        """
        print(" *wait message")

        def callback(ch, method, properties, body):
            body = json.loads(body.decode("utf-8"))
            if is_ack:
                ch.basic_ack(delivery_tag=method.delivery_tag)
                callback_func(body, properties)
            else:
                callback_func(body, properties)
                ch.basic_ack(delivery_tag=method.delivery_tag)

            print("receive done: ")

        channel.basic_qos(prefetch_count=10)
        channel.basic_consume(queue=queue_name, on_message_callback=callback)
        channel.start_consuming()


if __name__ == '__main__':
    pass
    # host, virtual_host, usr, passw = '10.9.2.220', 'camera-detect', 'tuan.dao', 'lEKUWKXwFt'
    # rab = ICRabbitMQ(host, virtual_host, usr, passw)
    # queue_name = 'test_rb'
    #
    # ## test run producer
    # channel = rab.init_queue(queue_name)
    # body_data = {"2": "3423432423"}
    # ICRabbitMQ.publish_message(channel, queue_name, body_data)
    #
    #
    # ## test run consumer
    # def callback_func(body):
    #     """
    #     user-defined callback function
    #     :param body: message from the queue
    #     :return:
    #     """
    #     print(body)
    #
    #
    # ICRabbitMQ.run_consummer(channel, queue_name, callback_func)
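Note that run_consummer invokes the user callback as callback_func(body, properties), so the two-argument form below is what the class actually expects (the single-argument callback in the commented demo above would raise a TypeError). A minimal consumer sketch, with placeholder host, vhost, and credentials:

# Minimal consumer sketch; host/vhost/user/password and queue name are placeholders.
from iclibs.ic_rabbit import ICRabbitMQ

rab = ICRabbitMQ("rabbit-host", "my-vhost", "user", "password")
channel = rab.init_queue("clustering_queue")

def on_message(body, properties):
    # body has already been JSON-decoded by ICRabbitMQ's internal callback
    print("received:", body)

ICRabbitMQ.run_consummer(channel, "clustering_queue", on_message)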