cuongnguyen910 committed
Commit 5120311 (1 parent: 411c450)

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the remaining files.
Files changed (50):
  1. .dockerignore +3 -0
  2. .gitignore +21 -0
  3. Dockerfile +46 -0
  4. Dockerfile_gpu +33 -0
  5. Dockerfile_gpu_Thien +34 -0
  6. Jenkinsfile +22 -0
  7. README.md +55 -12
  8. config/__init__.py +1 -0
  9. config/cfg.yaml +173 -0
  10. config/config.py +57 -0
  11. config/config.yml +38 -0
  12. consumer_clustering.py +181 -0
  13. consumer_clustering_mnews.py +137 -0
  14. consumer_hot_topic_ondemand.py +209 -0
  15. consumer_merge_clustering.py +96 -0
  16. consumer_merge_clustering_newscms.py +109 -0
  17. deployment.yaml +28 -0
  18. docker/api_trt/Dockerfile +22 -0
  19. docker/cpu/Dockerfile +46 -0
  20. docker/gpu/Dockerfile_gpu +33 -0
  21. docker_build.txt +6 -0
  22. entity/__init__.py +1 -0
  23. entity/__pycache__/__init__.cpython-37.pyc +0 -0
  24. entity/__pycache__/__init__.cpython-38.pyc +0 -0
  25. entity/__pycache__/types.cpython-37.pyc +0 -0
  26. entity/__pycache__/types.cpython-38.pyc +0 -0
  27. entity/types.py +0 -0
  28. function/SessionProcess.py +55 -0
  29. function/__init__.py +3 -0
  30. function/__pycache__/SessionProcess.cpython-37.pyc +0 -0
  31. function/__pycache__/SessionProcess.cpython-38.pyc +0 -0
  32. function/__pycache__/__init__.cpython-37.pyc +0 -0
  33. function/__pycache__/__init__.cpython-38.pyc +0 -0
  34. function/__pycache__/tc_v2.cpython-37.pyc +0 -0
  35. function/__pycache__/tc_v2.cpython-38.pyc +0 -0
  36. function/clean_text.py +84 -0
  37. function/detect_time.py +92 -0
  38. function/embed_vncorenlp.py +161 -0
  39. function/sentence_embbeding.py +41 -0
  40. function/summary_with_llm.py +210 -0
  41. function/tc_v2.py +573 -0
  42. function/topic_clustering.py +458 -0
  43. function/topic_clustering_mnews.py +339 -0
  44. function/topic_clustering_not_summary.py +463 -0
  45. function/topic_clustering_social.py +156 -0
  46. function/topic_clustering_v2.py +390 -0
  47. function/translate.py +37 -0
  48. function/utils.py +94 -0
  49. get_config.py +5 -0
  50. iclibs/ic_rabbit.py +126 -0
.dockerignore ADDED
@@ -0,0 +1,3 @@
+ *venv*
+ log
+ log_run
.gitignore ADDED
@@ -0,0 +1,21 @@
+ venv/
+ data/
+ vncore/
+ global/
+ cls/
+ check.json
+ test3.py
+ test2.py
+ time_test.py
+ vncorenlp
+ *venv*
+ __pycache__/
+ req_daily
+ log*
+ sample
+ model
+ *.json
+ *test*
+ docker_venv
+ core
+ models
Dockerfile ADDED
@@ -0,0 +1,46 @@
1
+ FROM python:3.7
2
+
3
+ WORKDIR /app
4
+
5
+
6
+ RUN apt-get update && apt-get install build-essential cmake git -y
7
+
8
+ #----------------JRE (for vncorenlp)--------------
9
+ RUN apt-get update && \
10
+ DEBIAN_FRONTEND=noninteractive \
11
+ apt-get -y install default-jre-headless && \
12
+ apt-get clean && \
13
+ rm -rf /var/lib/apt/lists/*
14
+
15
+ RUN apt-get install unzip
16
+ RUN pip install gdown
17
+ RUN gdown --id 1MTAPYy9AcYtfiJ6m_pz6MPeA6li8pYp7
18
+ RUN unzip vncorenlp.zip -d /app/
19
+
20
+
21
+ # COPY ./model /app/model
22
+
23
+ RUN mkdir -p /app/model
24
+ RUN mkdir -p /app/log
25
+ RUN mkdir -p /app/log_run
26
+
27
+ COPY reqs_cpu.txt /app/
28
+ RUN pip install -r reqs_cpu.txt
29
+
30
+ COPY ./load_model.py /app/
31
+ RUN python load_model.py
32
+
33
+ COPY ./config /app/config
34
+ COPY ./entity /app/entity
35
+ COPY ./function /app/function
36
+ COPY ./main_cache.py /app/
37
+ COPY ./service_cache.py /app/
38
+ COPY ./summary.py /app/
39
+ COPY ./merge_topic.py /app/
40
+ COPY ./consumer_clustering.py /app/
41
+ COPY ./consumer_merge_clustering.py /app/
42
+ COPY ./run_multi_process.py /app/
43
+
44
+ RUN rm -r ~/.cache/pip/*
45
+
46
+ CMD ["python", "run_multi_process.py"]
Dockerfile_gpu ADDED
@@ -0,0 +1,33 @@
1
+ FROM python:3.7
2
+
3
+ WORKDIR /app
4
+
5
+
6
+ RUN apt-get update && apt-get install build-essential cmake git -y
7
+
8
+ #----------------JRE (for vncorenlp)--------------
9
+ RUN apt-get update && \
10
+ DEBIAN_FRONTEND=noninteractive \
11
+ apt-get -y install default-jre-headless && \
12
+ apt-get clean && \
13
+ rm -rf /var/lib/apt/lists/*
14
+
15
+ COPY ./model /app/model
16
+
17
+ RUN pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html
18
+ RUN mkdir log
19
+ RUN mkdir log_run
20
+
21
+ COPY req.txt /app/
22
+ RUN pip install -r req.txt
23
+
24
+ COPY ./entity /app/entity
25
+ COPY ./function /app/function
26
+ COPY ./vncorenlp /app/vncorenlp
27
+ COPY ./main_cache.py /app/
28
+ COPY ./service_cache.py /app/
29
+ COPY ./summary.py /app/
30
+
31
+ RUN rm -r ~/.cache/pip/*
32
+
33
+ CMD ["python", "main_cache.py"]
Dockerfile_gpu_Thien ADDED
@@ -0,0 +1,34 @@
1
+ FROM python:3.7.17-slim-bullseye
2
+
3
+ WORKDIR /app
4
+
5
+
6
+ RUN apt-get update && apt-get install build-essential cmake git -y
7
+
8
+ #----------------JRE (for vncorenlp)--------------
9
+ RUN apt-get update && \
10
+ DEBIAN_FRONTEND=noninteractive \
11
+ apt-get -y install default-jre-headless && \
12
+ apt-get clean && \
13
+ rm -rf /var/lib/apt/lists/*
14
+
15
+ COPY ./model /app/model
16
+ COPY ./tensorRT /app/tensorRT
17
+
18
+ RUN pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html
19
+ RUN mkdir log
20
+ RUN mkdir log_run
21
+
22
+ COPY req.txt /app/
23
+ RUN pip install -r req.txt
24
+
25
+ COPY ./entity /app/entity
26
+ COPY ./function /app/function
27
+ COPY ./vncorenlp /app/vncorenlp
28
+ COPY ./main_cache_Thien.py /app/
29
+ COPY ./service_cache_Thien.py /app/
30
+ COPY ./summary.py /app/
31
+
32
+ RUN rm -r ~/.cache/pip/*
33
+
34
+ CMD ["python", "main_cache_Thien.py"]
Jenkinsfile ADDED
@@ -0,0 +1,22 @@
+ node {
+ checkout scm
+
+ env.DOCKER_API_VERSION="1.23"
+ registry_host = env.registry_host
+ appName = "clusteringcpu"
+
+ sh "git rev-parse --short HEAD > commit-id"
+ tag = readFile('commit-id').replace("\n", "").replace("\r", "")
+
+ stage("build"){
+ sh "docker build --tag ${registry_host}/${appName}:${tag} --file Dockerfile ."
+ }
+ stage("push"){
+ sh "docker push ${registry_host}/${appName}:${tag}"
+ }
+ stage("deploy"){
+ sh "sed -i s/{{tag}}/${tag}/g deployment.yaml \
+ && sed -i 's|{{registry_host}}|${registry_host}|g' deployment.yaml \
+ && kubectl ${env.token_kube} apply -f deployment.yaml"
+ }
+ }
README.md CHANGED
@@ -1,12 +1,55 @@
- ---
- title: Topic Clustering Global Dashboard
- emoji: 🏆
- colorFrom: gray
- colorTo: red
- sdk: gradio
- sdk_version: 4.43.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: topic-clustering-global-dashboard
+ app_file: service_periodic.py
+ sdk: gradio
+ sdk_version: 4.43.0
+ ---
+ ## Build docker
+ ```
+ docker build -t clustering-dashboard .
+ docker run -d --restart=always --name clustering-dashboard clustering-dashboard
+ ```
+
+ ## Deploy TRT
+ ```
+ scp -r docker/api_trt/Dockerfile ./
+ docker build -t api-clustering .
+ docker run --gpus 1 --dns 8.8.8.8 -it -p 8633:8633 -v /home/dp04/topic-clustering-global-dashboard:/app -t --name api-clustering api-clustering
+ ```
+
+ ### Topic Clustering
+ 1. Clustering API on port 8633: used for NEWS clustering in the various report types (quick report, overview report, template report, ...)
+ - Command:
+ ```
+ docker run --gpus all -it --rm -v /home/vietle/topic-clustering:/home/vietle/topic-clustering --name topic_clustering_trt_sb -p 8636:8633 topic-clustering-trt
+ cd /home/vietle/topic-clustering && CUDA_VISIBLE_DEVICES=<cuda_device_id> python main_cache.py
+ ```
+ - screen: 52097.clustering.trt
+ 2. Social clustering API: clusters social news
+ - Command: cd /home/vietle/topic-clustering/ && source docker_venv/bin/activate && uvicorn service_social:app --host 0.0.0.0 --port 8635
+ - Screen: 37529.clustering.social
+ 3. Clustering for the global dashboard:
+ - API:
+ ```
+ CMD: systemctl status bzo_clustering_api_cpu.service (port 8634)
+ Screen: 21445.clustering.cpu
+ ```
+ - Consumer:
+ + Daily clustering:
+ ```
+ Command: cd /home/vietle/topic-clustering/ && source docker_venv/bin/activate && OMP_NUM_THREADS=16 CUDA_VISIBLE_DEVICES=-1 python consumer_clustering.py
+ Screen: 16024.bzo.clustering_daily
+ ```
+ + Weekly/monthly clustering:
+ ```
+ Command: cd /home/vietle/topic-clustering/ && source docker_venv/bin/activate && OMP_NUM_THREADS=16 CUDA_VISIBLE_DEVICES='-1' python consumer_merge_clustering.py
+ Screen: 60767.bzo.clustering_monthly
+ ```
+
+ 4. News digest clustering:
+ - Command:
+ ```
+ docker run --gpus all -it --rm -v /home/vietle/topic-clustering:/home/vietle/topic-clustering --name topic_clustering_trt_sb -p 8636:8633 topic-clustering-trt
+ cd /home/vietle/topic-clustering/ && python main_cache_Thien.py
+ ```
+
config/__init__.py ADDED
@@ -0,0 +1 @@
+ from .config import get_config, parse_connection_string
config/cfg.yaml ADDED
@@ -0,0 +1,173 @@
1
+ AppSettings:
2
+ ConnectionStrings:
3
+ facebook_info: Server=10.9.3.141,1433; Database=facebook_info; User Id='facebook_info_read'; Password='dh2uxJny'; Pooling=True; Connect Timeout=45; MultipleActiveResultSets=True; Encrypt=False
4
+ GraylogConfig:
5
+ ConfigManager:
6
+ ChannelConfigs:
7
+ MemcachedConfigs:
8
+ QueueConfigs:
9
+ queue_topic_clustering:
10
+ Arguments: {}
11
+ AutoAck: false
12
+ AutoDelete: false
13
+ AutomaticRecoveryEnabled: true
14
+ ConsumerMaxRetry: 0
15
+ Durable: true
16
+ ErrorCounter: 100
17
+ ErrorCounterTotalSeconds: 60
18
+ Exchange: ''
19
+ ExchangeArguments:
20
+ ExchangeAutoDelete: false
21
+ ExchangeDurable: true
22
+ ExchangeType:
23
+ Exclusive: false
24
+ FailedQueue:
25
+ HostName: 10.9.3.251
26
+ HostNames:
27
+ - 10.9.3.251
28
+ Servers:
29
+ - Host: 10.9.3.251
30
+ Port: 15672
31
+ Key: queue1
32
+ MaxWorkpool: 1
33
+ NetworkRecoveryInterval: 120
34
+ Password: 1
35
+ PrefetchCount: 200
36
+ Queue: topic-clustering
37
+ QueueBindArguments:
38
+ QueueBindRoutingKey:
39
+ RequestedHeartbeat: 120
40
+ TopologyRecoveryEnabled: false
41
+ UserName: long.nguyen
42
+ VirtualHost: posts-broadcast
43
+ MessageDeliveryMode:
44
+ IsProducer: true
45
+ IsConsumer: true
46
+ queue_merge_clustering:
47
+ Arguments: {}
48
+ AutoAck: false
49
+ AutoDelete: false
50
+ AutomaticRecoveryEnabled: true
51
+ ConsumerMaxRetry: 0
52
+ Durable: true
53
+ ErrorCounter: 100
54
+ ErrorCounterTotalSeconds: 60
55
+ Exchange: ''
56
+ ExchangeArguments:
57
+ ExchangeAutoDelete: false
58
+ ExchangeDurable: true
59
+ ExchangeType:
60
+ Exclusive: false
61
+ FailedQueue:
62
+ HostName: 10.9.3.251
63
+ HostNames:
64
+ - 10.9.3.251
65
+ Servers:
66
+ - Host: 10.9.3.251
67
+ Port: 15672
68
+ Key: queue2
69
+ MaxWorkpool: 1
70
+ NetworkRecoveryInterval: 120
71
+ Password: 1
72
+ PrefetchCount: 200
73
+ Queue: merge-clustering
74
+ QueueBindArguments:
75
+ QueueBindRoutingKey:
76
+ RequestedHeartbeat: 120
77
+ TopologyRecoveryEnabled: false
78
+ UserName: long.nguyen
79
+ VirtualHost: posts-broadcast
80
+ MessageDeliveryMode:
81
+ IsProducer: true
82
+ IsConsumer: true
83
+ queue_topic_clustering_mnews:
84
+ Arguments: {}
85
+ AutoAck: false
86
+ AutoDelete: false
87
+ AutomaticRecoveryEnabled: true
88
+ ConsumerMaxRetry: 0
89
+ Durable: true
90
+ ErrorCounter: 100
91
+ ErrorCounterTotalSeconds: 60
92
+ Exchange: ''
93
+ ExchangeArguments:
94
+ ExchangeAutoDelete: false
95
+ ExchangeDurable: true
96
+ ExchangeType:
97
+ Exclusive: false
98
+ FailedQueue:
99
+ HostName: 10.9.3.251
100
+ HostNames:
101
+ - 10.9.3.251
102
+ Servers:
103
+ - Host: 10.9.3.251
104
+ Port: 15672
105
+ Key: queue1
106
+ MaxWorkpool: 1
107
+ NetworkRecoveryInterval: 120
108
+ Password: 1
109
+ PrefetchCount: 200
110
+ Queue: topic-clustering-mnews
111
+ QueueBindArguments:
112
+ QueueBindRoutingKey:
113
+ RequestedHeartbeat: 120
114
+ TopologyRecoveryEnabled: false
115
+ UserName: long.nguyen
116
+ VirtualHost: posts-broadcast
117
+ MessageDeliveryMode:
118
+ IsProducer: true
119
+ IsConsumer: true
120
+ queue_merge_clustering_newscms:
121
+ Arguments: {}
122
+ AutoAck: false
123
+ AutoDelete: false
124
+ AutomaticRecoveryEnabled: true
125
+ ConsumerMaxRetry: 0
126
+ Durable: true
127
+ ErrorCounter: 100
128
+ ErrorCounterTotalSeconds: 60
129
+ Exchange: ''
130
+ ExchangeArguments:
131
+ ExchangeAutoDelete: false
132
+ ExchangeDurable: true
133
+ ExchangeType: fanout
134
+ Exclusive: false
135
+ FailedQueue:
136
+ HostName: 10.9.3.251
137
+ HostNames:
138
+ - 10.9.3.251
139
+ Servers:
140
+ - Host: 10.9.3.251
141
+ Port: 5672
142
+ Key: queue_merge_clustering_newscms
143
+ MaxWorkpool: 1
144
+ NetworkRecoveryInterval: 120
145
+ Password: 1
146
+ PrefetchCount: 200
147
+ Queue: newscms-merge-clustering
148
+ QueueBindArguments:
149
+ QueueBindRoutingKey:
150
+ RequestedHeartbeat: 120
151
+ TopologyRecoveryEnabled: false
152
+ UserName: long.nguyen
153
+ VirtualHost: news-cms
154
+ MessageDeliveryMode:
155
+ IsProducer: true
156
+ IsConsumer: true
157
+ AppSettings:
158
+ ApiConnects:
159
+ api_save_clustering:
160
+ BaseUrl: https://staging.pontusinc.com/api/category_management/v1/topic-clustering-dashboard/insert
161
+ IsMonitor: true
162
+ MonitorUrl: https://staging.pontusinc.com/api/category_management/v1/topic-clustering-dashboard/insert
163
+ api_sbert:
164
+ BaseUrl: http://10.9.3.240:6789/sbert/encode_list
165
+ IsMonitor: true
166
+ MonitorUrl: http://10.9.3.240:6789/sbert/encode_list
167
+
168
+ api_save_cluster_newscms:
169
+ BaseUrl: https://staging.pontusinc.com/api/news_cms/News/update_cluster_result
170
+ IsMonitor: true
171
+ MonitorUrl: https://staging.pontusinc.com/api/news_cms/News/update_cluster_result
172
+ GraylogConfig:
173
+ key_test: test
config/config.py ADDED
@@ -0,0 +1,57 @@
1
+ import requests
2
+ import json
3
+ import os
4
+ from yaml import load
5
+ try:
6
+ from yaml import CLoader as Loader, CDumper as Dumper
7
+ except ImportError:
8
+ from yaml import Loader, Dumper
9
+
10
+
11
+ URL_CFG = "http://icomm-api-configserver/api/configserver/v1/configuration.yaml"
12
+ # http://10.9.2.151:31244/api/configserver/v1/configuration.yaml
13
+ # configure the hosts file in C:\Windows\System32\drivers\etc as follows:
14
+ # 123.31.42.17 icomm-api-configserver
15
+
16
+ AccessToken = "wbecrEfJk8F36y0WojqBQaqT28d6NaBnCLBgkoO2sCg3aNhYACkSxMNvWwlsAj5k"
17
+ Environment = "Production"
18
+ path_save_cfg = "config/cfg.yaml"
19
+
20
+
21
+ def get_config():
22
+ cfg = None
23
+ try:
24
+ payload = json.dumps({
25
+ "AccessToken": AccessToken,
26
+ "Environment": Environment
27
+ })
28
+ headers = {
29
+ 'accept': 'text/plain',
30
+ 'Content-Type': 'application/json-patch+json'
31
+ }
32
+
33
+ response = requests.request("POST", URL_CFG, headers=headers, data=payload)
34
+ # if response.status_code == 200:
35
+ # with open(path_save_cfg, "w+") as f:
36
+ # f.write(response.text)
37
+ except Exception as ve:
38
+ print(ve)
39
+ if os.path.exists(path_save_cfg):
40
+ with open(path_save_cfg) as f:
41
+ cfg = load(f, Loader)
42
+ return cfg
43
+
44
+
45
+ def parse_connection_string(str_cnn):
46
+ res = dict()
47
+ split_dt = str_cnn.split(";")
48
+ for c_sp in split_dt:
49
+ k, v = c_sp.split("=")
50
+ res[k.strip()] = v.replace("'", "").replace('"', '')
51
+ return res
52
+
53
+
54
+ if __name__ == '__main__':
55
+ cf = get_config()
56
+ print(cf)
57
+ print(parse_connection_string(cf["ConfigManager"]["ConnectionStrings"]["facebook_info"]["Value"]))
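For context (not part of the commit), a minimal sketch of what `parse_connection_string` above returns; the shortened connection string is illustrative and mirrors the `facebook_info` entry in `config/cfg.yaml`:

```python
from config import parse_connection_string  # exported by config/__init__.py

# Shortened example; the real string in cfg.yaml also carries Pooling, timeouts, etc.
# Note the helper assumes every ';'-separated segment contains exactly one '='.
cnn = "Server=10.9.3.141,1433; Database=facebook_info; User Id='facebook_info_read'"
print(parse_connection_string(cnn))
# {'Server': '10.9.3.141,1433', 'Database': 'facebook_info', 'User Id': 'facebook_info_read'}
```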
config/config.yml ADDED
@@ -0,0 +1,38 @@
1
+ queue_topic_clustering:
2
+ host:
3
+ 10.9.3.251
4
+ virtual_host:
5
+ posts-broadcast
6
+ queue_name:
7
+ topic-clustering
8
+ usr_name:
9
+ long.nguyen
10
+ password:
11
+ 1
12
+
13
+ queue_topic_clustering_mnews:
14
+ host:
15
+ 10.9.3.251
16
+ virtual_host:
17
+ posts-broadcast
18
+ queue_name:
19
+ topic-clustering-mnews
20
+ usr_name:
21
+ long.nguyen
22
+ password:
23
+ 1
24
+
25
+ queue_merge_clustering:
26
+ host:
27
+ 10.9.3.251
28
+ virtual_host:
29
+ posts-broadcast
30
+ queue_name:
31
+ merge-clustering
32
+ usr_name:
33
+ long.nguyen
34
+ password:
35
+ 1
36
+
37
+ api_save_clustering:
38
+ https://staging.pontusinc.com/api/category_management/v1/topic-clustering-dashboard/insert
consumer_clustering.py ADDED
@@ -0,0 +1,181 @@
1
+ import pika
2
+ import os
3
+ # os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
4
+
5
+ import json
6
+ import time
7
+ # from get_config import config_params
8
+ from config import get_config
9
+ from function import topic_clustering_not_summary as tc
10
+ from function import topic_clustering_social
11
+ import requests
12
+
13
+ config_params = get_config()
14
+ ConfigManager = config_params['ConfigManager']
15
+
16
+ last_time_check = time.time()
17
+ def update_result(result, type='daily', meta = {}):
18
+ benchmark_children_id = -1
19
+ benchmark_id = -1
20
+ source_tagids = []
21
+ for id_cluster in result:
22
+ for doc in result[id_cluster][:1]:
23
+ source_tagids = doc.get('source_tagids',[])
24
+ for key in doc:
25
+ if "benchmark_child" in key:
26
+ benchmark_children_id = int(key.lstrip('benchmark_child_'))
27
+ if "benchmark" in key and 'child' not in key:
28
+ benchmark_id = int(key.lstrip('benchmark_'))
29
+ break
30
+
31
+ if not source_tagids:
32
+ source_tagids = []
33
+
34
+ if len(source_tagids) > 0:
35
+ benchmark_id = 0
36
+ benchmark_children_id = 0
37
+
38
+ output = {
39
+ "benchmark_id": benchmark_id,
40
+ "benchmark_children_id": benchmark_children_id,
41
+ "source_tagids": source_tagids,
42
+ "country_code": meta.get('country_code',''),
43
+ "type": type,
44
+ "data": json.dumps(result)
45
+ }
46
+ # with open('test_result.json','w') as f:
47
+ # json.dump(output, f, ensure_ascii=False)
48
+
49
+ # url = config_params['api_save_clustering']
50
+ url = ConfigManager['ApiConnects']['api_save_clustering']['BaseUrl']
51
+ # with open("/home/vietle/topic-clustering/config/save.json", 'w') as f:
52
+ # json.dump(output, f,ensure_ascii=False)
53
+ res = requests.post(url, json = output)
54
+ print(res.text)
55
+ print('Update result !!!!!!!!!')
56
+
57
+ def callback_func(ch, method, properties, body):
58
+ print("receive done: ")
59
+
60
+
61
+
62
+ starttime = time.time()
63
+ body = json.loads(body.decode("utf-8"))
64
+
65
+ # with open('input_daily.json','w') as f:
66
+ # json.dump(body, f, ensure_ascii=False)
67
+
68
+ docs = body['docs']
69
+ # threshold = body['threshold']
70
+ threshold = 0.25
71
+ top_cluster = body['top_cluster']
72
+ top_sentence = body['top_sentence']
73
+ topn_summary = body['topn_summary']
74
+ hash_str = body['hash_str']
75
+ st_time = body['st_time']
76
+ meta = body.get('meta',{})
77
+ country_code = meta.get("country_code", "")
78
+ delete_message = False if country_code in ["ICOMM-RND","SOCIAL"] else True
79
+
80
+ print("country_code: ", country_code, "meta: ", meta)
81
+
82
+ is_cache = False
83
+ try:
84
+ with open("log_run/log.txt") as f:
85
+ data_dict = json.load(f)
86
+ except Exception as ve:
87
+ print(ve)
88
+ data_dict = {}
89
+
90
+ try:
91
+ if hash_str in data_dict:
92
+ path_res = data_dict[hash_str]["response_path"]
93
+ with open(path_res) as ff:
94
+ results = json.load(ff)
95
+ print("time analysis (cache): ", time.time() - st_time)
96
+ update_result(results,meta=meta)
97
+ is_cache = True
98
+ except Exception as vee:
99
+ print(vee)
100
+
101
+ if not is_cache:
102
+ if country_code in ["SOCIAL"]:
103
+ results = topic_clustering_social.topic_clustering(docs, threshold, top_cluster=top_cluster, top_sentence=top_sentence,
104
+ topn_summary=topn_summary, delete_message=delete_message)
105
+ else:
106
+ results = tc.topic_clustering(docs, threshold, top_cluster=top_cluster, top_sentence=top_sentence,
107
+ topn_summary=topn_summary, delete_message=delete_message)
108
+ update_result(results, meta=meta)
109
+
110
+ path_res = "log/result_{0}.txt".format(hash_str)
111
+ with open(path_res, "w+") as ff:
112
+ ff.write(json.dumps(results))
113
+
114
+ data_dict[hash_str] = {"time": st_time, "response_path": path_res}
115
+
116
+ lst_rm = []
117
+ global last_time_check
118
+ if time.time() - last_time_check > 3600:
119
+ print("check log to del .....")
120
+ last_time_check = time.time()
121
+ for dt in data_dict:
122
+ if time.time() - data_dict[dt]["time"] > 30 * 24 * 3600:
123
+ lst_rm.append(dt)
124
+ for dt in lst_rm:
125
+ del data_dict[dt]
126
+ with open("log_run/log.txt", "w+") as ff:
127
+ ff.write(json.dumps(data_dict))
128
+ print("time analysis: ", time.time() - starttime)
129
+ ch.basic_ack(delivery_tag=method.delivery_tag)
130
+
131
+
132
+ def test():
133
+ with open('req_daily/aus.json') as f:
134
+ body = json.load(f)
135
+
136
+ docs = body['response']['docs']
137
+ # threshold = body['threshold']
138
+ threshold = 0.25
139
+ top_cluster = body['top_cluster']
140
+ top_sentence = body['top_sentence']
141
+ topn_summary = body['topn_summary']
142
+ # hash_str = body['hash_str']
143
+ # st_time = body['st_time']
144
+ meta = body['response'].get('meta',{})
145
+ results = tc.topic_clustering(docs, threshold, top_cluster=top_cluster, top_sentence=top_sentence,
146
+ topn_summary=topn_summary, delete_message=True)
147
+ print(results)
148
+ # update_result(results, meta=meta)
149
+ # print(123)
150
+ if __name__ == '__main__':
151
+ # test()
152
+ params = ConfigManager['QueueConfigs']['queue_topic_clustering']
153
+ usr_name = params["UserName"]
154
+ password = str(params["Password"])
155
+ host = params["HostName"]
156
+ virtual_host = params["VirtualHost"]
157
+ queue_name = params["Queue"]
158
+
159
+
160
+
161
+ # params = config_params['queue_topic_clustering']
162
+ # usr_name = params["usr_name"]
163
+ # password = str(params["password"])
164
+ # host = params["host"]
165
+ # virtual_host = params["virtual_host"]
166
+ # queue_name = params["queue_name"]
167
+
168
+ while True:
169
+ try:
170
+ credentials = pika.PlainCredentials(usr_name, password)
171
+ connection = pika.BlockingConnection(
172
+ pika.ConnectionParameters(host=host, virtual_host=virtual_host, credentials=credentials, heartbeat=3600, blocked_connection_timeout=3600))
173
+ channel = connection.channel()
174
+ channel.queue_declare(queue=queue_name, durable=True, arguments={"x-max-priority": 10})
175
+ print(" * wait message")
176
+ channel.basic_qos(prefetch_count=1)
177
+ channel.basic_consume(queue=queue_name, on_message_callback=callback_func)
178
+ channel.start_consuming()
179
+ except Exception as ex:
180
+ print(f'[ERROR] ', ex)
181
+ # raise ex
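For context (not part of the commit), a minimal producer sketch showing the payload shape that `callback_func` in consumer_clustering.py reads; it assumes the RabbitMQ settings in `config/cfg.yaml` are reachable, and all field values below are placeholders:

```python
import json
import time

import pika

from config import get_config

params = get_config()["ConfigManager"]["QueueConfigs"]["queue_topic_clustering"]

# Fields consumed by callback_func (the threshold itself is hard-coded to 0.25 in the consumer).
payload = {
    "docs": [],                   # documents to cluster
    "top_cluster": 5,
    "top_sentence": -1,
    "topn_summary": 10,
    "hash_str": "example-hash",   # cache key tracked in log_run/log.txt
    "st_time": time.time(),
    "meta": {"country_code": ""},
}

credentials = pika.PlainCredentials(params["UserName"], str(params["Password"]))
connection = pika.BlockingConnection(
    pika.ConnectionParameters(host=params["HostName"],
                              virtual_host=params["VirtualHost"],
                              credentials=credentials))
channel = connection.channel()
channel.queue_declare(queue=params["Queue"], durable=True, arguments={"x-max-priority": 10})
channel.basic_publish(exchange="", routing_key=params["Queue"], body=json.dumps(payload))
connection.close()
```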
consumer_clustering_mnews.py ADDED
@@ -0,0 +1,137 @@
1
+ import pika
2
+ import os
3
+ # os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
4
+
5
+ import json
6
+ import time
7
+ # from get_config import config_params
8
+ from config import get_config
9
+ from function import topic_clustering_mnews as tc
10
+ import requests
11
+
12
+ config_params = get_config()
13
+ ConfigManager = config_params['ConfigManager']
14
+
15
+ last_time_check = time.time()
16
+ def update_result(result, type='daily', meta = {}):
17
+ command_id = meta["command_id"]
18
+
19
+ output = {
20
+ "id": command_id,
21
+ "status": 2,
22
+ "json_result": json.dumps(result)
23
+ }
24
+
25
+ url = "https://staging.pontusinc.com/api/news_management/v1/quick_search/update-command-result"
26
+ # with open("/home/vietle/topic-clustering/config/save_.json", 'w') as f:
27
+ # json.dump(output, f,ensure_ascii=False)
28
+ res = requests.post(url, json = output)
29
+ print(res.text)
30
+ print('Update result !!!!!!!!!')
31
+
32
+ def callback_func(ch, method, properties, body):
33
+ print("receive done: ")
34
+ starttime = time.time()
35
+ body = json.loads(body.decode("utf-8"))
36
+
37
+ # with open('input_daily.json','w') as f:
38
+ # json.dump(body, f, ensure_ascii=False)
39
+
40
+ docs = body['docs']
41
+ # threshold = body['threshold']
42
+ command_id = body.get("command_id",0)
43
+ print(command_id)
44
+ threshold = 0.1
45
+ top_cluster = body['top_cluster']
46
+ top_sentence = body['top_sentence']
47
+ topn_summary = body['topn_summary']
48
+ hash_str = body['hash_str']
49
+ st_time = body['st_time']
50
+ meta = body.get('meta',{})
51
+ meta["command_id"] = command_id
52
+
53
+ is_cache = False
54
+ try:
55
+ with open("log_mnews/log/log.txt") as f:
56
+ data_dict = json.load(f)
57
+ except Exception as ve:
58
+ print(ve)
59
+ data_dict = {}
60
+
61
+ try:
62
+ if hash_str in data_dict:
63
+ path_res = data_dict[hash_str]["response_path"]
64
+ with open(path_res) as ff:
65
+ results = json.load(ff)
66
+ print("time analysis (cache): ", time.time() - st_time)
67
+ update_result(results,meta=meta)
68
+ is_cache = True
69
+ except Exception as vee:
70
+ print(vee)
71
+
72
+ if not is_cache:
73
+ results = tc.topic_clustering(docs, threshold, top_cluster=top_cluster, top_sentence=top_sentence,
74
+ topn_summary=topn_summary)
75
+ update_result(results, meta=meta)
76
+
77
+ path_res = "log_mnews/result_{0}.txt".format(hash_str)
78
+ with open(path_res, "w+") as ff:
79
+ ff.write(json.dumps(results))
80
+
81
+ data_dict[hash_str] = {"time": st_time, "response_path": path_res}
82
+
83
+ lst_rm = []
84
+ global last_time_check
85
+ if time.time() - last_time_check > 3600:
86
+ print("check log to del .....")
87
+ last_time_check = time.time()
88
+ for dt in data_dict:
89
+ if time.time() - data_dict[dt]["time"] > 30 * 24 * 3600:
90
+ lst_rm.append(dt)
91
+ for dt in lst_rm:
92
+ del data_dict[dt]
93
+ with open("log_mnews/log/log.txt", "w+") as ff:
94
+ ff.write(json.dumps(data_dict))
95
+ print("time analysis: ", time.time() - starttime)
96
+ ch.basic_ack(delivery_tag=method.delivery_tag)
97
+
98
+
99
+ def test():
100
+ with open('req_daily/aus.json') as f:
101
+ body = json.load(f)
102
+
103
+ docs = body['response']['docs']
104
+ # threshold = body['threshold']
105
+ threshold = 0.25
106
+ top_cluster = body['top_cluster']
107
+ top_sentence = body['top_sentence']
108
+ topn_summary = body['topn_summary']
109
+ # hash_str = body['hash_str']
110
+ # st_time = body['st_time']
111
+ meta = body['response'].get('meta',{})
112
+ results = tc.topic_clustering(docs, threshold, top_cluster=top_cluster, top_sentence=top_sentence,
113
+ topn_summary=topn_summary)
114
+ update_result(results, meta=meta)
115
+ print(123)
116
+ if __name__ == '__main__':
117
+ params = ConfigManager['QueueConfigs']['queue_topic_clustering_mnews']
118
+ usr_name = params["UserName"]
119
+ password = str(params["Password"])
120
+ host = params["HostName"]
121
+ virtual_host = params["VirtualHost"]
122
+ queue_name = params["Queue"]
123
+
124
+ while True:
125
+ try:
126
+ credentials = pika.PlainCredentials(usr_name, password)
127
+ connection = pika.BlockingConnection(
128
+ pika.ConnectionParameters(host=host, virtual_host=virtual_host, credentials=credentials, heartbeat=3600, blocked_connection_timeout=3600))
129
+ channel = connection.channel()
130
+ channel.queue_declare(queue=queue_name, durable=True, arguments={"x-max-priority": 10})
131
+ print(" * wait message")
132
+ channel.basic_qos(prefetch_count=1)
133
+ channel.basic_consume(queue=queue_name, on_message_callback=callback_func)
134
+ channel.start_consuming()
135
+ except Exception as ex:
136
+ print(f'[ERROR] ', ex)
137
+ # raise ex
consumer_hot_topic_ondemand.py ADDED
@@ -0,0 +1,209 @@
1
+ from fastapi import FastAPI
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from entity import InputHotTopic, ResponseQueue
4
+ from threading import Thread
5
+ from queue import Queue
6
+ import time
7
+ import json, requests
8
+ from service_cache_Thien import get_data_sorl
9
+ import os
10
+ from datetime import datetime
11
+ from email_validator import push_msg_tele
12
+
13
+ import time
14
+ import json
15
+ import hashlib
16
+
17
+
18
+ from pydantic import BaseModel
19
+
20
+ class InputHotTopic(BaseModel):
21
+ start_time: str = "2024-09-03 23:00:00"
22
+ end_time: str = "2024-09-05 23:00:00"
23
+ query: str = "Giá nhà chung cư trên Hà Nội"
24
+ keywords: list = ["chung cư, Hà Nội", "Hoà Lạc"]
25
+ top_cluster: int = 5
26
+ prompt: str = """Trong 300 từ, hãy tổng hợp thành một đoạn văn một cách đầy đủ, chi tiết, và trung thực về các chủ đề xung quanh biến động giá nhà chung cư Hà Nội từ nội dung dưới đây.
27
+ Nếu không có thông tin gì liên quan đến giá nhà chung cư Hà Nội trong nội dung cung cấp thì trả lời "không có thông tin". Không đưa quan điểm cá nhân, không lặp lại một phần câu hỏi, loại bỏ phần mở đầu. Không có những câu từ liên kết như: "Sau đây là nội dung tóm tắt", "Nội dung tóm tắt là", "Dưới đây là " ... """
28
+ check_relevent: str = "Hãy đánh giá nội dung dưới đây có thông tin liên quan đến giá cả nhà chung cư Hà Nội hay không? Chỉ trả lời có hoặc không, không đưa thêm thông tin không liên quan"
29
+ max_posts: int = 5000
30
+
31
+ def get_hash_id(item: InputHotTopic):
32
+ str_hash = ""
33
+ if item.id_topic:
34
+ str_hash += item.id_topic
35
+ str_hash += item.start_time
36
+ return hashlib.sha224(str_hash.encode("utf-8")).hexdigest()
37
+ else:
38
+ return ""
39
+
40
+ class SessionProcess(object):
41
+
42
+ def __init__(self):
43
+ self.session = dict()
44
+
45
+ def hash_session(self, query: InputHotTopic):
46
+ hash_dict = query.dict()
47
+ hash_dict['time'] = int(time.time())
48
+ return hashlib.sha224(json.dumps(hash_dict).encode("utf-8")).hexdigest()
49
+
50
+ def insert_session(self, data_input):
51
+ print('data_input: ', data_input)
52
+ # if self.mode == "command_center":
53
+ # hash_id = hash_session(data_input)
54
+ # else:
55
+ hash_id = self.hash_session(data_input)
56
+ if hash_id not in self.session:
57
+ self.session[hash_id] = {"status": 0, "created_time": time.time(), "update_time": time.time(),
58
+ "result": {}, "data": data_input}
59
+ return hash_id
60
+
61
+ def get_info_session(self, hash_id: str):
62
+ if hash_id in self.session:
63
+ return self.session[hash_id]
64
+ return {"status": -2, "result": {}, "meta": {}}
65
+
66
+ def update_session(self, hash_id: str, result: dict, status: int):
67
+ if hash_id in self.session:
68
+ self.session[hash_id]["status"] = status
69
+ self.session[hash_id]["result"] = result
70
+ self.session[hash_id]["update_time"] = time.time()
71
+ return True
72
+ return False
73
+
74
+ def delete_session(self, hash_id: str):
75
+ if hash_id in self.session:
76
+ del self.session[hash_id]
77
+ return True
78
+ return False
79
+
80
+ SESSION = SessionProcess()
81
+ app = FastAPI(title="Hot Topic")
82
+ app.add_middleware(
83
+ CORSMiddleware,
84
+ allow_origins=["*"],
85
+ allow_credentials=True,
86
+ allow_methods=["*"],
87
+ allow_headers=["*"],
88
+ )
89
+
90
+ NUM_OF_THREAD = 2
91
+ QQ = Queue(maxsize=0) # don't limit queue
92
+
93
+ def process_wc():
94
+ print('Run thr')
95
+ global SESSION, QQ
96
+ while True:
97
+ if not QQ.empty():
98
+ hash_id = QQ.get()
99
+ SESSION.update_session(hash_id, {}, 0)
100
+ print("update trạng thái status = 0: đang xử lý")
101
+ try:
102
+ ss_info = SESSION.get_info_session(hash_id)
103
+ status = ss_info["status"]
104
+ print("trạng thái hiện tại: ", status)
105
+ if status == 0:
106
+ data_input = SESSION.session[hash_id]["data"]
107
+ res_doc = get_data_sorl(data_input.query, data_input.keywords, data_input.start_time, data_input.end_time, max_posts = data_input.max_posts)
108
+ print('length res_doc: ', len(res_doc))
109
+ if not res_doc:
110
+ SESSION.update_session(hash_id, {}, -1)
111
+ else:
112
+ # start_time: str = "2024-03-03 23:00:00"
113
+ current_time = datetime.now()
114
+ time_now = current_time.strftime("%Y-%m-%d %H:%M:%S")
115
+ d = {
116
+ "id_topic": "99999",
117
+ "start_time": time_now,
118
+ "end_time": data_input.end_time,
119
+ "threshold": 0.3,
120
+ "top_sentence": -1,
121
+ "top_cluster": data_input.top_cluster,
122
+ "topn_summary": 10,
123
+ "type_cluster": "",
124
+ "lang_process": "",
125
+ "prompt": data_input.prompt,
126
+ "topic_name": data_input.check_relevent,
127
+ "responseHeader": {},
128
+ "benchmark_topics": [],
129
+ "response": {"docs": res_doc}
130
+ }
131
+
132
+ str_hash = ""
133
+ str_hash += "99999"
134
+ str_hash += time_now
135
+ hash_id_path = hashlib.sha224(str_hash.encode("utf-8")).hexdigest()
136
+
137
+ st_time = time.time()
138
+ try:
139
+ response = requests.post('http://10.9.3.241:8636/newsanalysis/topic_clustering', json=d, timeout=5)
140
+ except:
141
+ print("Timeout done")
142
+
143
+ print("push done msg")
144
+ res_clus = {}
145
+ # flag = False
146
+ # count = 0
147
+ # while not flag and count < 18000:
148
+ # if os.path.exists("/home/vietle/topic-clustering/log/result_{0}.txt".format(hash_id_path)):
149
+ # path_res = "/home/vietle/topic-clustering/log/result_{0}.txt".format(hash_id_path)
150
+ # with open(path_res, encoding="utf-8") as ff:
151
+ # res_clus = json.load(ff)
152
+ # res_clus["num_articles"] = len(res_doc)
153
+ # message = "Hello"
154
+ # push_msg_tele(data_input.bot_token , data_input.chat_id , message)
155
+ # print('done processing result')
156
+ # flag = True
157
+ # time.sleep(1)
158
+ # count +=1
159
+ # print('sleep: ', count)
160
+
161
+
162
+ print("update done msg")
163
+ SESSION.update_session(hash_id_path, res_clus, 1)
164
+ except Exception as ve_:
165
+ print(ve_)
166
+ SESSION.update_session(hash_id_path, {}, -1)
167
+ raise ve_
168
+ else:
169
+ time.sleep(2)
170
+
171
+
172
+ for _ in range(NUM_OF_THREAD):
173
+ worker = Thread(target=process_wc, args=())
174
+ worker.setDaemon(True)
175
+ worker.start()
176
+
177
+
178
+ @app.post("/api/v1/send_message")
179
+ def send_requests(item: InputHotTopic):
180
+ global SESSION
181
+ hash_id = SESSION.insert_session(item)
182
+ if SESSION.session[hash_id]["status"] == 0:
183
+ QQ.put(hash_id)
184
+
185
+ return ResponseQueue(statusCode=1, message="Push to queue done !", result={"hash_id": hash_id})
186
+
187
+ class InputSession(BaseModel):
188
+ hash_id: str = ""
189
+
190
+ class Response(BaseModel):
191
+ statusCode: int = 200
192
+ message: str = ""
193
+ result: dict = {}
194
+ @app.post("/api/mining/qna/result")
195
+ def get_result(item: InputSession):
196
+ global SESSION
197
+ res = SESSION.get_info_session(item.hash_id)
198
+ status = res["status"]
199
+ res = res["result"]
200
+ if status == -1:
201
+ msg = "ERROR"
202
+ elif status == 0:
203
+ msg = "processing ..."
204
+ elif status == 1:
205
+ msg = "done"
206
+ # SESSION.delete_session(item.hash_id)
207
+ else:
208
+ msg = "nothing"
209
+ return Response(statusCode=status, message=msg, result=res)
consumer_merge_clustering.py ADDED
@@ -0,0 +1,96 @@
1
+ # import os
2
+ # os.environ["CUDA_VISIBLE_DEVICES"] ="-1"
3
+ import pika
4
+ import json
5
+ import time
6
+ import requests
7
+
8
+ from merge_topic import main
9
+ # from get_config import config_params
10
+ from config import get_config
11
+
12
+ config_params = get_config()
13
+ ConfigManager = config_params['ConfigManager']
14
+
15
+ def update_result(result, type='daily', meta = {}):
16
+ benchmark_children_id = -1
17
+ benchmark_id = -1
18
+ source_tagids = []
19
+ for id_cluster in result:
20
+ for doc in result[id_cluster][:1]:
21
+ source_tagids = doc.get('source_tagids',[])
22
+ for key in doc:
23
+ if "benchmark_child" in key:
24
+ benchmark_children_id = int(key.lstrip('benchmark_child_'))
25
+ if "benchmark" in key and 'child' not in key:
26
+ benchmark_id = int(key.lstrip('benchmark_'))
27
+ break
28
+
29
+ if not source_tagids:
30
+ source_tagids = []
31
+ if len(source_tagids) > 0:
32
+ benchmark_id = 0
33
+ benchmark_children_id = 0
34
+
35
+ output = {
36
+ "benchmark_id": benchmark_id,
37
+ "benchmark_children_id": benchmark_children_id,
38
+ "source_tagids": source_tagids,
39
+ "country_code": meta.get('country_code',''),
40
+ "type": type,
41
+ "data": json.dumps(result)
42
+ }
43
+
44
+ # with open('test_result.json','w') as f:
45
+ # json.dump(output, f, ensure_ascii=False)
46
+
47
+ # url = config_params['api_save_clustering']
48
+ url = ConfigManager['ApiConnects']['api_save_clustering']['BaseUrl']
49
+
50
+ res = requests.post(url, json = output)
51
+ print(res.text)
52
+ print('Update result !!!!!!!!!')
53
+
54
+ def callback_func(ch, method, properties, body):
55
+ print("receive done: ")
56
+ starttime = time.time()
57
+ body = json.loads(body.decode("utf-8"))
58
+
59
+ req = body
60
+ type = req['type']
61
+ meta = req.get('meta', {})
62
+ res = main(req)
63
+ update_result(res, type, meta=meta)
64
+ print('Time process:', time.time() - starttime)
65
+ ch.basic_ack(delivery_tag=method.delivery_tag)
66
+
67
+
68
+ if __name__ == '__main__':
69
+ params = ConfigManager['QueueConfigs']['queue_merge_clustering']
70
+ usr_name = params["UserName"]
71
+ password = str(params["Password"])
72
+ host = params["HostName"]
73
+ virtual_host = params["VirtualHost"]
74
+ queue_name = params["Queue"]
75
+
76
+ # params = config_params['queue_merge_clustering']
77
+ # usr_name = params["usr_name"]
78
+ # password = str(params["password"])
79
+ # host = params["host"]
80
+ # virtual_host = params["virtual_host"]
81
+ # queue_name = params["queue_name"]
82
+
83
+ while True:
84
+ try:
85
+ credentials = pika.PlainCredentials(usr_name, password)
86
+ connection = pika.BlockingConnection(
87
+ pika.ConnectionParameters(host=host, virtual_host=virtual_host, credentials=credentials, heartbeat=3600, blocked_connection_timeout=3600))
88
+ channel = connection.channel()
89
+ channel.queue_declare(queue=queue_name, durable=True, arguments={"x-max-priority": 10})
90
+ print(" * wait message")
91
+ channel.basic_qos(prefetch_count=1)
92
+ channel.basic_consume(queue=queue_name, on_message_callback=callback_func)
93
+ channel.start_consuming()
94
+ except Exception as ex:
95
+ print(f'[ERROR] ', ex)
96
+ # raise ex
consumer_merge_clustering_newscms.py ADDED
@@ -0,0 +1,109 @@
1
+ # import os
2
+ # os.environ['CUDA_VISIBLE_DEVICES'] = "-1"
3
+
4
+ import pika
5
+ import json
6
+ import time
7
+ import requests
8
+
9
+ from merge_topic import main
10
+ # from get_config import config_params
11
+ from config import get_config
12
+
13
+ config_params = get_config()
14
+ ConfigManager = config_params['ConfigManager']
15
+ URL_SAVE_CLUSTERING_CMS = ConfigManager["ApiConnects"]["api_save_cluster_newscms"]["BaseUrl"]
16
+
17
+ def update_result(result, id, meta = {}):
18
+ print(result)
19
+ print("-----")
20
+ output = {
21
+ "id": id,
22
+ "result":json.dumps(result)
23
+ }
24
+ res = requests.post(url=URL_SAVE_CLUSTERING_CMS, json = output)
25
+ print(res.text)
26
+ print('Update result !!!!!!!!!')
27
+
28
+ def callback_func(ch, method, properties, body):
29
+ print("receive done: ")
30
+ starttime = time.time()
31
+ body = json.loads(body.decode("utf-8"))
32
+ with open("/home/vietle/topic-clustering/input_merge1.json", 'w') as f:
33
+ json.dump(body,f,ensure_ascii = False)
34
+ req = body
35
+ req["type"] = "monthly"
36
+ id = req["id"]
37
+ meta = req.get('meta', {})
38
+
39
+ preprocess_reformat = []
40
+ preprocess = req["preprocess"]
41
+ for daily_clusters in preprocess:
42
+
43
+ dict_cluster = {}
44
+ for i,doc in enumerate(daily_clusters["topic"]):
45
+ reps_post = doc
46
+ lst_doc = [reps_post]
47
+ lst_doc.extend(doc.get("list_posts", []))
48
+ dict_cluster[i] = lst_doc
49
+ it = {
50
+ "topic": dict_cluster
51
+ }
52
+ preprocess_reformat.append(it)
53
+ req["preprocess"] = preprocess_reformat
54
+
55
+ res = main(req)
56
+ update_result(res, id, meta=meta)
57
+ print('Time process:', time.time() - starttime)
58
+ ch.basic_ack(delivery_tag=method.delivery_tag)
59
+
60
+
61
+ def test():
62
+ with open("/home/vietle/topic-clustering/input_merge1.json", 'r') as f:
63
+ body = json.load(f)
64
+
65
+ req = body
66
+ req["type"] = "monthly"
67
+ id = req["id"]
68
+ meta = req.get('meta', {})
69
+
70
+ preprocess_reformat = []
71
+ preprocess = req["preprocess"]
72
+ for daily_clusters in preprocess:
73
+
74
+ dict_cluster = {}
75
+ for i,topic in enumerate(daily_clusters["topic"]):
76
+ dict_cluster[i] = topic
77
+ it = {
78
+ "topic": dict_cluster
79
+ }
80
+ preprocess_reformat.append(it)
81
+ req["preprocess"] = preprocess_reformat
82
+ with open("/home/vietle/topic-clustering/input_merge2.json", 'w') as f:
83
+ json.dump(req,f,ensure_ascii = False)
84
+ res = main(req)
85
+
86
+ if __name__ == '__main__':
87
+ # test()
88
+ params = ConfigManager['QueueConfigs']['queue_merge_clustering_newscms']
89
+ usr_name = params["UserName"]
90
+ password = str(params["Password"])
91
+ host = params["HostName"]
92
+ virtual_host = params["VirtualHost"]
93
+ queue_name = params["Queue"]
94
+
95
+
96
+ while True:
97
+ try:
98
+ credentials = pika.PlainCredentials(usr_name, password)
99
+ connection = pika.BlockingConnection(
100
+ pika.ConnectionParameters(host=host, virtual_host=virtual_host, credentials=credentials, heartbeat=3600, blocked_connection_timeout=3600))
101
+ channel = connection.channel()
102
+ channel.queue_declare(queue=queue_name, durable=True, arguments={"x-max-priority": 10})
103
+ print(" * wait message")
104
+ channel.basic_qos(prefetch_count=1)
105
+ channel.basic_consume(queue=queue_name, on_message_callback=callback_func)
106
+ channel.start_consuming()
107
+ except Exception as ex:
108
+ print(f'[ERROR] ', ex)
109
+ raise ex
deployment.yaml ADDED
@@ -0,0 +1,28 @@
1
+ apiVersion: apps/v1beta1
2
+ kind: Deployment
3
+ metadata:
4
+ name: clusteringcpu
5
+ labels:
6
+ app: clusteringcpu
7
+ spec:
8
+ strategy:
9
+ type: Recreate
10
+ template:
11
+ metadata:
12
+ labels:
13
+ app: clusteringcpu
14
+ tier: clusteringcpu
15
+ spec:
16
+ containers:
17
+ - image: {{registry_host}}/clusteringcpu:{{tag}}
18
+ name: clusteringcpu
19
+ resources:
20
+ requests:
21
+ cpu: 6
22
+ memory: 60000Mi
23
+ limits:
24
+ cpu: 10
25
+ memory: 100000Mi
26
+ ports:
27
+ - containerPort:
28
+ name:
docker/api_trt/Dockerfile ADDED
@@ -0,0 +1,22 @@
1
+ FROM nvcr.io/nvidia/tensorrt:22.07-py3
2
+
3
+ EXPOSE 8633
4
+ WORKDIR /app
5
+
6
+ RUN echo "nameserver 8.8.8.8" > /etc/resolv.conf && pip install --upgrade pip
7
+
8
+ RUN echo "nameserver 8.8.8.8" > /etc/resolv.conf && apt-get update && \
9
+ DEBIAN_FRONTEND=noninteractive \
10
+ apt-get -y install default-jre-headless && \
11
+ apt-get clean && \
12
+ rm -rf /var/lib/apt/lists/*
13
+
14
+ COPY req.txt /app/
15
+ RUN echo "nameserver 8.8.8.8" > /etc/resolv.conf && pip install -r req.txt
16
+ RUN echo "nameserver 8.8.8.8" > /etc/resolv.conf && pip install --upgrade pip && pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
17
+
18
+ COPY ./main_cache.py /app/
19
+
20
+
21
+ RUN rm -r ~/.cache/pip/*
22
+ CMD ["python3", "main_cache.py"]
docker/cpu/Dockerfile ADDED
@@ -0,0 +1,46 @@
1
+ FROM python:3.7
2
+
3
+ WORKDIR /app
4
+
5
+
6
+ RUN apt-get update && apt-get install build-essential cmake git -y
7
+
8
+ #----------------JRE (for vncorenlp)--------------
9
+ RUN apt-get update && \
10
+ DEBIAN_FRONTEND=noninteractive \
11
+ apt-get -y install default-jre-headless && \
12
+ apt-get clean && \
13
+ rm -rf /var/lib/apt/lists/*
14
+
15
+ RUN apt-get install unzip
16
+ RUN pip install gdown
17
+ RUN gdown --id 1MTAPYy9AcYtfiJ6m_pz6MPeA6li8pYp7
18
+ RUN unzip vncorenlp.zip -d /app/
19
+
20
+
21
+ # COPY ./model /app/model
22
+
23
+ RUN mkdir -p /app/model
24
+ RUN mkdir -p /app/log
25
+ RUN mkdir -p /app/log_run
26
+
27
+ COPY reqs_cpu.txt /app/
28
+ RUN pip install -r reqs_cpu.txt
29
+
30
+ COPY ./load_model.py /app/
31
+ RUN python load_model.py
32
+
33
+ COPY ./config /app/config
34
+ COPY ./entity /app/entity
35
+ COPY ./function /app/function
36
+ COPY ./main_cache.py /app/
37
+ COPY ./service_cache.py /app/
38
+ COPY ./summary.py /app/
39
+ COPY ./merge_topic.py /app/
40
+ COPY ./consumer_clustering.py /app/
41
+ COPY ./consumer_merge_clustering.py /app/
42
+ COPY ./run_multi_process.py /app/
43
+
44
+ RUN rm -r ~/.cache/pip/*
45
+
46
+ CMD ["python", "run_multi_process.py"]
docker/gpu/Dockerfile_gpu ADDED
@@ -0,0 +1,33 @@
1
+ FROM python:3.7
2
+
3
+ WORKDIR /app
4
+
5
+
6
+ RUN apt-get update && apt-get install build-essential cmake git -y
7
+
8
+ #----------------JRE (for vncorenlp)--------------
9
+ RUN apt-get update && \
10
+ DEBIAN_FRONTEND=noninteractive \
11
+ apt-get -y install default-jre-headless && \
12
+ apt-get clean && \
13
+ rm -rf /var/lib/apt/lists/*
14
+
15
+ COPY ./model /app/model
16
+
17
+ RUN pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html
18
+ RUN mkdir log
19
+ RUN mkdir log_run
20
+
21
+ COPY req.txt /app/
22
+ RUN pip install -r req.txt
23
+
24
+ COPY ./entity /app/entity
25
+ COPY ./function /app/function
26
+ COPY ./vncorenlp /app/vncorenlp
27
+ COPY ./main_cache.py /app/
28
+ COPY ./service_cache.py /app/
29
+ COPY ./summary.py /app/
30
+
31
+ RUN rm -r ~/.cache/pip/*
32
+
33
+ CMD ["python", "main_cache.py"]
docker_build.txt ADDED
@@ -0,0 +1,6 @@
+ docker build -t topic_clustering .
+ docker run -d --restart=always -p8633:8633 --name topic_clustering topic_clustering
+ /////docker run -p8633:8633 --name topic_clustering topic_clustering
+
+ docker build -t clustering-dashboard .
+ docker run --name clustering-dashboard clustering-dashboard
entity/__init__.py ADDED
@@ -0,0 +1 @@
+ from .types import *
entity/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (181 Bytes).
 
entity/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (205 Bytes).
 
entity/__pycache__/types.cpython-37.pyc ADDED
Binary file (129 kB).
 
entity/__pycache__/types.cpython-38.pyc ADDED
Binary file (129 kB).
 
entity/types.py ADDED
The diff for this file is too large to render. See raw diff
 
function/SessionProcess.py ADDED
@@ -0,0 +1,55 @@
1
+
2
+ import time
3
+ import json
4
+ import hashlib
5
+
6
+
7
+ from pydantic import BaseModel
8
+
9
+ class InputHotTopic(BaseModel):
10
+ start_time: str = ""
11
+ end_time: str = ""
12
+ query: str = ""
13
+ keywords: list = []
14
+ top_cluster: int = 5
15
+ prompt: str = ""
16
+ check_relevent: str = ""
17
+ class SessionProcess(object):
18
+
19
+ def __init__(self):
20
+ self.session = dict()
21
+
22
+ def hash_session(self, query: InputHotTopic):
23
+ hash_dict = query.dict()
24
+ hash_dict['time'] = int(time.time())
25
+ return hashlib.sha224(json.dumps(hash_dict).encode("utf-8")).hexdigest()
26
+
27
+ def insert_session(self, data_input):
28
+ print('data_input: ', data_input)
29
+ # if self.mode == "command_center":
30
+ # hash_id = hash_session(data_input)
31
+ # else:
32
+ hash_id = self.hash_session(data_input)
33
+ if hash_id not in self.session:
34
+ self.session[hash_id] = {"status": 0, "created_time": time.time(), "update_time": time.time(),
35
+ "result": {}, "data": data_input}
36
+ return hash_id
37
+
38
+ def get_info_session(self, hash_id: str):
39
+ if hash_id in self.session:
40
+ return self.session[hash_id]
41
+ return {"status": -2, "result": {}, "meta": {}}
42
+
43
+ def update_session(self, hash_id: str, result: dict, status: int):
44
+ if hash_id in self.session:
45
+ self.session[hash_id]["status"] = status
46
+ self.session[hash_id]["result"] = result
47
+ self.session[hash_id]["update_time"] = time.time()
48
+ return True
49
+ return False
50
+
51
+ def delete_session(self, hash_id: str):
52
+ if hash_id in self.session:
53
+ del self.session[hash_id]
54
+ return True
55
+ return False
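For context (not part of the commit), a minimal sketch of the session lifecycle the class above provides; consumer_hot_topic_ondemand.py drives it the same way. The query value is a placeholder, and the import assumes the script runs from inside `function/` (the package-level `from SessionProcess import SessionProcess` in `function/__init__.py` expects the module on `sys.path`):

```python
from SessionProcess import InputHotTopic, SessionProcess

session = SessionProcess()
hash_id = session.insert_session(InputHotTopic(query="example query"))

print(session.get_info_session(hash_id)["status"])    # 0 -> queued / processing
session.update_session(hash_id, {"clusters": {}}, 1)  # 1 -> done
print(session.get_info_session(hash_id)["result"])    # {'clusters': {}}
session.delete_session(hash_id)
```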
function/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # from . import topic_clustering_v2
+ # from . import sentence_embedding
+ from SessionProcess import SessionProcess
function/__pycache__/SessionProcess.cpython-37.pyc ADDED
Binary file (1.84 kB).
 
function/__pycache__/SessionProcess.cpython-38.pyc ADDED
Binary file (1.88 kB).
 
function/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (161 Bytes).
 
function/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (220 Bytes).
 
function/__pycache__/tc_v2.cpython-37.pyc ADDED
Binary file (15.9 kB).
 
function/__pycache__/tc_v2.cpython-38.pyc ADDED
Binary file (15.4 kB).
 
function/clean_text.py ADDED
@@ -0,0 +1,84 @@
1
+ import re
2
+
3
+ def normalize_text(text):
4
+ # text = text.lower()
5
+ text = text.replace('🏻', '')
6
+ full_text_clean = text
7
+ full_text_clean.replace(");this.closest('table').remove();","")
8
+ full_text_clean = re.sub('(Thứ .{2,4}|Chủ nhật),( ngày)? \d{1,2}\/\d{1,2}\/\d{4}( \d{1,2}:\d{1,2})?( AM| PM)?( \(GMT.{1,3}\))?','',full_text_clean)
9
+ if not any([word in full_text_clean[:15].lower() for word in ['nga','covid']]):
10
+ full_text_clean = re.sub('^.{1,15}?-','',full_text_clean)
11
+ # full_text_clean = re.sub('-.{1,15}?$','',full_text_clean)
12
+
13
+ full_text_clean = re.sub('- VNExplorer','',full_text_clean)
14
+ full_text_clean = re.sub('Theo .{1,20}$','',full_text_clean)
15
+ full_text_clean = re.sub('\(.*(Ảnh|Nguồn).*?\)','',full_text_clean)
16
+ full_text_clean = re.sub('\d{1,2} (giờ|phút) trước','',full_text_clean)
17
+ full_text_clean = re.sub(r"http\S+", "", full_text_clean)
18
+ # full_text_clean = re.sub('(\\r)*( )*(\\n)*( )*(\\r)*( )*(\\n)','.', full_text_clean)
19
+ full_text_clean = re.sub(r"\.( )*(\.)+", '. ', full_text_clean)
20
+ full_text_clean = re.sub('\.(?!\d)', '. ', full_text_clean)
21
+ full_text_clean = re.sub('(\.(\s)+)+', '. ', full_text_clean)
22
+ full_text_clean = re.sub('<[^<]+?>', '',full_text_clean)
23
+ full_text_clean = re.sub('\d{1,2}:\d{2}( )?\d{1,2}\/\d{1,2}\/\d{4}','',full_text_clean)
24
+ full_text_clean = re.sub("Ảnh(:)?(Getty)?","", full_text_clean)
25
+ full_text_clean = full_text_clean.replace("Read more about:","").replace("Read more","").replace("Advertising","").replace("bookmark border.","").replace('the latest tech news, global tech news daily, tech news today, startups, usa tech, asia tech, china tech, eu tech, global tech, in-depth electronics reviews, 24h tech news, 24h tech news, top mobile apps, tech news daily, gaming hardware, big tech news, useful technology tips, expert interviews, reporting on the business of technology, venture capital funding, programing language',"").replace('Live updates:','').replace('-VNExplorer','').replace('Reuters:','').replace('AFP:','').replace('�','').replace('- VNExplorer','').replace('Global Tech News Daily','').replace('AFP/TTXVN','').replace('Reuters/TTXVN','').replace('Tin quốc tế','').replace('Xem tiện ích cảnh báo giá CK','')
26
+ full_text_clean = full_text_clean.replace("COVID","Covid")
27
+ full_text_clean = re.sub('[A-Z ]{2,10}(,.{6,20})?(—|–|-|-)','',full_text_clean) #NEW YORK, Feb 27 — .... /BRUSSELS—...
28
+ full_text_clean = re.sub('\(ảnh:.*?\)','.',full_text_clean)
29
+ full_text_clean = re.sub("(\| )?(\(.{1,7}\)( )?)+$", "", full_text_clean)
30
+ full_text_clean = re.sub('\d{2} [\w]{3,4}, \d{4}. \d{2}.\d{2} (AM|PM) IST','',full_text_clean) #02 Mar, 2022, 10.01 AM IST
31
+ full_text_clean = full_text_clean.replace('Suzuka. config. supports_premium_subscription && window. localStorage. getItem ( "premiumSubscription ")) ) {var e = document. createElement ( "script "); e. setAttribute ( "class ", "titan-conditional "); e. setAttribute ( "data-ad-id ", "adspot-300x250-pos1 "); document. body. appendChild (e);}','')
32
+
33
+ full_text_clean = re.sub('\d{2}\/\d{2}\/\d{4} \d{2}:\d{2} GMT(\+|-)\d{1,2}', "", full_text_clean)
34
+ full_text_clean = re.sub('\(.{1,20}\)', '', full_text_clean)
35
+ full_text_clean = re.sub('\{.{1,20}\}', '', full_text_clean)
36
+ full_text_clean = re.sub('\[.{1,20}\]', '', full_text_clean)
37
+ full_text_clean = re.sub('[A-Z].{5,10} , \d{2}:\d{2} (GMT(\+|-)\d{1,2})?',"",full_text_clean)
38
+
39
+ full_text_clean = re.sub('(Theo|theo) .{1,15},', '', full_text_clean)
40
+ full_text_clean = re.sub('(Theo|theo) .{1,15},', '', full_text_clean)
41
+ full_text_clean = re.sub('theo.{3,20}$','', full_text_clean)
42
+ full_text_clean = re.sub('^Trong khi đó','', full_text_clean)
43
+
44
+ full_text_clean = re.sub('^\d{1,10} minutes ago', '', full_text_clean)
45
+ full_text_clean = re.sub('^\d{1,10} hours ago', '', full_text_clean)
46
+ full_text_clean = re.sub('^\d{1,10} days ago', '', full_text_clean)
47
+ full_text_clean = re.sub('^\d{1,10} years ago', '', full_text_clean)
48
+ full_text_clean = re.sub('^\d{1,10} months ago', '', full_text_clean)
49
+ full_text_clean = re.sub('^\d{1,10} minute ago', '', full_text_clean)
50
+ full_text_clean = re.sub('^\d{1,10} day ago', '', full_text_clean)
51
+ full_text_clean = re.sub('^\d{1,10} year ago', '', full_text_clean)
52
+ full_text_clean = re.sub('^\d{1,10} month ago', '', full_text_clean)
53
+ full_text_clean = re.sub('^\d{1,10} hour ago', '', full_text_clean)
54
+ full_text_clean = re.sub('^(a|an) minute ago', '', full_text_clean)
55
+ full_text_clean = re.sub('^(a|an) hour ago', '', full_text_clean)
56
+ full_text_clean = re.sub('^(a|an) day ago', '', full_text_clean)
57
+ full_text_clean = re.sub('^(a|an) month ago', '', full_text_clean)
58
+ full_text_clean = re.sub('^(a|an) year ago', '', full_text_clean)
59
+ full_text_clean = re.sub('^.{0,12}(tờ|theo|nguồn|trích dẫn|trang|báo|hãng).{1,30}(đưa tin|dẫn lời|trích dẫn|nhận định|cho biết)', '', full_text_clean, flags=re.I)
60
+
61
+ text = re.sub('\s+',' ',full_text_clean)
62
+ text = re.sub('Đọc chi tiết bài viết tại đây.*','',text,flags=re.I)
63
+ # text = re.sub('[(\d)(\:)(\|)(\/)(\s+)]+','',text) # 10:20 | 09/03/2022
64
+
65
+ text = re.sub('(\d{1,2}:\d{2}( )*)\|( )*\d{1,2}(/|-)\d{2}(/|-)\d{4}','',text)
66
+ text = re.sub('^(\d)+[\,\.]\s+ ','',text) # 3, phát ngôn viên Bộ T
67
+ text = re.sub('((chủ nhật)|(thứ bảy)|(thử sáu)|(thứ năm)|(thứ tư)|(thứ ba)|(thứ hai))([(\d)(\:)(,)(\|\/)(\s+)]+)((VOV)|(VTV))$','',text,flags=re.I) # và Ukraine để giải quyết xung đột Chủ Nhật, 06:32, 20/03/2022 VOV.
68
+
69
+ text = re.sub('^((\d)|(\:)|(\.)|(\|)|(\s+)|(in bài biết)|(in bài viết)|(\/))+ ','',text,flags=re.I) # 10:20 | 09/03/2022 In bài biết. 10:20 | 09/03/2022 In bài biết Việc xuất khẩu tôm sang thị trường Nga có thể bị ảnh hưởng trước tình hình căng thẳng của Nga-Ukraine. Hiệp hội Chế biến và Xuất khẩu thuỷ sản V
70
+ text = re.sub('theo hãng tin [A-Z].{0,15},','', text, flags=re.I)
71
+ text = re.sub('((Theo.{0,30})|(Reuters)).*?(link gốc).*?$','',text,flags=re.I)
72
+ text = re.sub('video:RIA Novosti/Bộ Quốc phòng Nga','',text,flags=re.I)
73
+ text = re.sub('Báo.{5,20}$','',text)
74
+ text = re.sub('RIA Novosti/Bộ Quốc phòng Nga','',text)
75
+ text = re.sub('(chính trị|quân sự|đối ngoại|thời sự|khoa học|pháp luật) \d{1,2} giờ','',text,flags=re.I)
76
+ text = text.replace('|','')
77
+ full_text_clean = re.sub('^.*?(Link nguồn)','',text,flags=re.I)
78
+ full_text_clean = re.sub(',( )*[A-z].{1,30}(đưa tin|trích dẫn)','', full_text_clean)
79
+ full_text_clean = re.sub('(Reuters|Vnexpress)(\).)?','',full_text_clean,flags=re.I)
80
+ full_text_clean = re.sub('^VOV.','',full_text_clean)
81
+ full_text_clean = full_text_clean.replace("Many Japanese worry Fortune-Takashi Nakamichi, Bloomberg • 1d","").replace('baotintuc. vn',"").replace('YÊN BÁI QUYẾT TÂM THỰC HIỆN THẮNG LỢI CHƯƠNG TRÌNH HÀNH ĐỘNG SỐ 56 – CTr/TU CỦA TỈNH ỦY QUYẾT TÂM ĐƯA Y.ÊN BÁI PHÁT TRIỂN NHANH, BỀN VỮNG THEO HƯỚNG “XANH, HÀI HÒA, BẢN SẮC VÀ HẠNH PHÚC” TRỞ THÀNH TỈNH PHÁT TRIỂN KHÁ VÀO NĂM 2025','')
82
+ full_text_clean = full_text_clean.replace("Baoquocte","").replace('ชั่วโมงที่ผ่านมา.','').replace('""challenge=" "coron= ""corona=" "covid-19= ""designs=" "endgame= ""tutorial=" "ui= ""pandemic=" "quarantine= ""list=" "similarity= " "ux. ""press=" "copyright= ""contact=" "creators= ""advertise=" "terms= ""privacy=" "policy= ""safety=" "youtube= ""works=" "test= ""features=" "creators.', '').replace('nbsp & nbsp & nbsp & nbsp & nbsp Copy Link', '').replace('Tổng thống Nga Vladimir Putin và Chủ tịch Trung Quốc Tập Cận Bình.','').replace('Thế giới Toàn cảnh Bảo Hà ','')
83
+ full_text_clean = re.sub('(a|p)\. m\.','',full_text_clean)
84
+ return full_text_clean
function/detect_time.py ADDED
@@ -0,0 +1,92 @@
1
+ import re
2
+ import requests
3
+ import datetime
4
+ import operator
5
+ from typing import *
6
+ from dateutil.relativedelta import *
7
+ from itertools import groupby
8
+ from dateparser import parse
9
+
10
+ day = '[0-3]{0,1}[0-9]'
11
+ month = '[0,1]{0,1}[0-9]'
12
+ year = '\d{4}'
13
+ sep = '\s*[-/\.\s]\s*'
14
+
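+ # Regex patterns for Vietnamese date mentions: dd/mm/yyyy, yyyy-mm-dd, "15 tháng 3 (năm 2024)",
+ # and day/month following words like "ngày", "sáng", "trưa", "chiều", "tối", "đêm", "hôm".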
15
+ patterns = [
16
+ f"{day}{sep}{month}{sep}{year}",
17
+ f"{year}{sep}{month}{sep}{day}",
18
+ f"{day}\s+tháng\s+{month}",
19
+ f"{day}\s+tháng\s+{month}{sep}{year}",
20
+ f"{day}\s+tháng\s+{month}\s+năm\s+{year}",
21
+ f"{day}\s+tháng\s+{month}",
22
+ f"(?<=ngày)\s+{day}{sep}{month}",
23
+ f"(?<=ngày)\s+{day}{sep}{month}{sep}{year}",
24
+ f"(?<=sáng)\s+{day}{sep}{month}",
25
+ f"(?<=sáng)\s+{day}{sep}{month}{sep}{year}",
26
+ f"(?<=trưa)\s+{day}{sep}{month}",
27
+ f"(?<=trưa)\s+{day}{sep}{month}{sep}{year}",
28
+ f"(?<=chiều)\s+{day}{sep}{month}",
29
+ f"(?<=chiều)\s+{day}{sep}{month}{sep}{year}",
30
+ f"(?<=tối)\s+{day}{sep}{month}",
31
+ f"(?<=tối)\s+{day}{sep}{month}{sep}{year}"
32
+ f"(?<=đêm)\s+{day}{sep}{month}",
33
+ f"(?<=đêm)\s+{day}{sep}{month}{sep}{year}",
34
+ f"(?<=hôm)\s+{day}{sep}{month}",
35
+ f"(?<=hôm)\s+{day}{sep}{month}{sep}{year}",
36
+ f"{day}{sep}{month}[\s\.\,\)]"
37
+ ]
38
+
39
+
40
+ def extract_pattern(text: str, patterns: List[str]):
41
+ detected = []
42
+ for pattern in patterns:
43
+ for match in re.finditer(pattern, text):
44
+ detected.append((match.start(), match.end()))
45
+ detected.sort()
46
+ output = []
47
+ curr = -1
48
+ for start, values in groupby(detected, key=operator.itemgetter(0)):
49
+ if start < curr:
50
+ continue
51
+ values = list(values)
52
+ values.sort(key=operator.itemgetter(1), reverse=True)
53
+ output.append(values[0])
54
+ curr = values[0][1]
55
+ return output
56
+
57
+
58
+ def detect_time(text: str, language: str = 'vi', base: Optional[datetime.datetime] = None):
59
+ text = text.lower()
60
+ detected_patterns = extract_pattern(text, patterns)
61
+ output = []
62
+ settings = {
63
+ 'PREFER_DAY_OF_MONTH': 'first'
64
+ }
65
+ if base:
66
+ settings['RELATIVE_BASE'] = base
67
+ for start, end in detected_patterns:
68
+ segment = text[start:end]
69
+ segment = re.sub('\s+', ' ', segment).strip().lower()
70
+ candidate = parse(segment, languages=[language], settings=settings)
71
+ output.append((segment, candidate))
72
+ return output
73
+
74
+
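+ # Group sentence indices by each detected (text, datetime) pair, optionally keeping only
+ # timestamps that fall inside the (start_time, end_time) window.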
75
+ def get_time_post(sentences, patterns, start_time=None, end_time=None):
76
+ dict_time_evs = {}
77
+ for i, sen in enumerate(sentences):
78
+ if sen.strip() != "":
79
+ time_ex = detect_time(sen)  # detect_time's second argument is the language; it already uses the module-level patterns
80
+ for te in time_ex:
81
+ if te[1] is not None:
82
+ if start_time is None or end_time is None or (end_time > te[1].timestamp() > start_time):
83
+ if te not in dict_time_evs:
84
+ dict_time_evs[te] = []
85
+ dict_time_evs[te].append(i)
86
+ return dict_time_evs
87
+
88
+
89
+ if __name__ == '__main__':
90
+ print(detect_time("VietTimes – Ngoại trưởng Mỹ Antony Blinken ngày đã tuyên bố trong một cuộc họp qua"
91
+ "truyền hình với ngoại trưởng các nước ASEAN Mỹ bác bỏ các yêu sách “bất hợp pháp” của"
92
+ "Trung Quốc ở Biển Đông.", language="vi"))
function/embed_vncorenlp.py ADDED
@@ -0,0 +1,161 @@
1
+ import jnius_config
2
+ import os
3
+ import shutil
4
+
5
+ save_dir = "/home2/vietle/icgpt/vncorenlp-1.2"
6
+ max_heap_size='-Xmx4g'
7
+ jnius_config.add_options(max_heap_size)
8
+ jnius_config.set_classpath(save_dir + "/VnCoreNLP-1.2.jar")
9
+
10
+ def download_model(save_dir='./'):
11
+ # current_path = os.path.abspath(os.getcwd())
12
+ if save_dir[-1] == '/':
13
+ save_dir = save_dir[:-1]
14
+ if os.path.isdir(save_dir + "/models") and os.path.exists(save_dir + '/VnCoreNLP-1.2.jar'):
15
+ print("VnCoreNLP model folder " + save_dir + " already exists! Please load VnCoreNLP from this folder!")
16
+ else:
17
+ os.mkdir(save_dir + "/models")
18
+ os.mkdir(save_dir + "/models/dep")
19
+ os.mkdir(save_dir + "/models/ner")
20
+ os.mkdir(save_dir + "/models/postagger")
21
+ os.mkdir(save_dir + "/models/wordsegmenter")
22
+ # jar
23
+ os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.2.jar")
24
+ shutil.move("VnCoreNLP-1.2.jar", save_dir + "/VnCoreNLP-1.2.jar")
25
+ # wordsegmenter
26
+ os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab")
27
+ os.system(
28
+ "wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr")
29
+ shutil.move("vi-vocab", save_dir + "/models/wordsegmenter/vi-vocab")
30
+ shutil.move("wordsegmenter.rdr", save_dir + "/models/wordsegmenter/wordsegmenter.rdr")
31
+ # postagger
32
+ os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/postagger/vi-tagger")
33
+ shutil.move("vi-tagger", save_dir + "/models/postagger/vi-tagger")
34
+ # ner
35
+ os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/ner/vi-500brownclusters.xz")
36
+ os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/ner/vi-ner.xz")
37
+ os.system(
38
+ "wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/ner/vi-pretrainedembeddings.xz")
39
+ shutil.move("vi-500brownclusters.xz", save_dir + "/models/ner/vi-500brownclusters.xz")
40
+ shutil.move("vi-ner.xz", save_dir + "/models/ner/vi-ner.xz")
41
+ shutil.move("vi-pretrainedembeddings.xz", save_dir + "/models/ner/vi-pretrainedembeddings.xz")
42
+ # parse
43
+ os.system("wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/dep/vi-dep.xz")
44
+ shutil.move("vi-dep.xz", save_dir + "/models/dep/vi-dep.xz")
45
+
46
+
47
+ class VnCoreNLP:
48
+ def __init__(self, annotators=["wseg", "pos", "ner", "parse"], save_dir = './'):
49
+ if save_dir[-1] == '/':
50
+ save_dir = save_dir[:-1]
51
+ if os.path.isdir(save_dir + "/models") == False or os.path.exists(save_dir + '/VnCoreNLP-1.2.jar') == False:
52
+ raise Exception("Please download the VnCoreNLP model!")
53
+ self.current_working_dir = os.getcwd()
54
+ os.chdir(save_dir)
55
+
56
+ from jnius import autoclass
57
+ javaclass_vncorenlp = autoclass('vn.pipeline.VnCoreNLP')
58
+ self.javaclass_String = autoclass('java.lang.String')
59
+ self.annotators = annotators
60
+ if "wseg" not in annotators:
61
+ self.annotators.append("wseg")
62
+
63
+ self.model = javaclass_vncorenlp(annotators)
64
+
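+ # Parse VnCoreNLP's tab-separated output: one block per sentence, one token per line with
+ # fields index, wordForm, posTag, nerLabel, head, depLabel.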
65
+ def annotate_text(self, text):
66
+ from jnius import autoclass
67
+ javaclass_Annotation = autoclass('vn.pipeline.Annotation')
68
+ java_str = self.javaclass_String(text)  # avoid shadowing the built-in str
69
+ annotation = javaclass_Annotation(java_str)
70
+ self.model.annotate(annotation)
71
+ dict_sentences = {}
72
+ list_sentences = annotation.toString().split("\n\n")[:-1]
73
+ for i in range(len(list_sentences)):
74
+ list_words = list_sentences[i].split("\n")
75
+ list_dict_words = []
76
+ for word in list_words:
77
+ dict_word = {}
78
+ word = word.replace("\t\t", "\t")
79
+ list_tags = word.split("\t")
80
+ dict_word["index"] = int(list_tags[0])
81
+ dict_word["wordForm"] = list_tags[1]
82
+ dict_word["posTag"] = list_tags[2]
83
+ dict_word["nerLabel"] = list_tags[3]
84
+ if "parse" in self.annotators:
85
+ dict_word["head"] = int(list_tags[4])
86
+ else:
87
+ dict_word["head"] = list_tags[4]
88
+ dict_word["depLabel"] = list_tags[5]
89
+ list_dict_words.append(dict_word)
90
+ dict_sentences[i] = list_dict_words
91
+ return dict_sentences
92
+
93
+ def tokenize(self, text):
94
+ annotated_sens = self.annotate_text(text=text)
95
+ output = []
96
+ for id_sen in annotated_sens:
97
+ annotated_sen = annotated_sens[id_sen]
98
+ out = [s["wordForm"] for s in annotated_sen]
99
+ output.append(out)
100
+ return output
101
+
102
+ def pos_tag(self, text):
103
+ annotated_sens = self.annotate_text(text=text)
104
+ output = []
105
+ for id_sen in annotated_sens:
106
+ annotated_sen = annotated_sens[id_sen]
107
+ out = [(s["wordForm"], s["posTag"]) for s in annotated_sen]
108
+ output.append(out)
109
+ return output
110
+
111
+ def ner(self, text):
112
+ annotated_sens = self.annotate_text(text=text)
113
+ output = []
114
+ for id_sen in annotated_sens:
115
+ annotated_sen = annotated_sens[id_sen]
116
+ out = [(s["wordForm"], s["nerLabel"]) for s in annotated_sen]
117
+ output.append(out)
118
+ return output
119
+
120
+ def word_segment(self, text):
121
+ from jnius import autoclass
122
+ javaclass_Annotation = autoclass('vn.pipeline.Annotation')
123
+ java_str = self.javaclass_String(text)
124
+ annotation = javaclass_Annotation(java_str)
125
+ self.model.annotate(annotation)
126
+ list_segmented_sentences = []
127
+ list_sentences = annotation.toString().split("\n\n")[:-1]
128
+ for sent in list_sentences:
129
+ list_words = sent.split("\n")
130
+ list_segmented_words = []
131
+ for word in list_words:
132
+ word = word.replace("\t\t", "\t")
133
+ list_tags = word.split("\t")
134
+ list_segmented_words.append(list_tags[1])
135
+ list_segmented_sentences.append(" ".join(list_segmented_words))
136
+ return list_segmented_sentences
137
+
138
+ def print_out(self, dict_sentences):
139
+ for sent in dict_sentences.keys():
140
+ list_dict_words = dict_sentences[sent]
141
+ for word in list_dict_words:
142
+ print(str(word["index"]) + "\t" + word["wordForm"] + "\t" + word["posTag"] + "\t" + word["nerLabel"] + "\t" + str(word["head"]) + "\t" + word["depLabel"])
143
+ print("")
144
+
145
+ def annotate_file(self, input_file, output_file):
146
+ os.chdir(self.current_working_dir)
147
+ input_str = self.javaclass_String(input_file)
148
+ output_str = self.javaclass_String(output_file)
149
+ self.model.processPipeline(input_str, output_str, self.annotators)
150
+
151
+ if __name__ == '__main__':
152
+ download_model(save_dir='/home2/vietle/icgpt/vncorenlp-1.2')
153
+ model = VnCoreNLP(annotators=["wseg","pos","ner"], save_dir='/home2/vietle/icgpt/vncorenlp-1.2')
154
+ # output = model.annotate_text("Ông Nguyễn Khắc Chúc đang làm việc tại Đại học Quốc gia Hà Nội. Bà Lan, vợ ông Chúc, cũng làm việc tại đây.")
155
+ # print(output)
156
+
157
+ text = "Sau khi tốt nghiệp Trung học năm 1975, ông theo học dự bị Ngoại ngữ tại Đại học Ngoại ngữ (nay là Trường Đại học Hà Nội)."
158
+ out = model.tokenize(text)
159
+ print(out)
160
+ # model.print_out(output)
161
+
function/sentence_embbeding.py ADDED
@@ -0,0 +1,41 @@
1
+ import requests
2
+ import json
3
+
4
+ URL_EMBBED_ZH = "http://10.9.3.239:1999/api/v1/extract_feature_zh"
5
+ URL_EMBBED_EN = "http://10.9.3.239:1999/api/v1/extract_feature_en"
6
+ URL_EMBBED_BGE = "http://10.9.3.240:5045/api/v1/embedding"
7
+
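+ # Internal embedding service endpoints: each helper POSTs {"text": [...]} and returns a list of
+ # vectors, or [] if the request fails.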
8
+ def embbeded_zh(text: list):
9
+ try:
10
+ r = requests.post(URL_EMBBED_ZH, data = json.dumps({
11
+ "text": text
12
+ }))
13
+ embs = r.json()["vectors"]
14
+ return embs
15
+ except Exception as ve:
16
+ print(ve)
17
+ return []
18
+
19
+
20
+ def embbeded_en(text: list):
21
+ try:
22
+ r = requests.post(URL_EMBBED_EN, data = json.dumps({
23
+ "text": text
24
+ }))
25
+ embs = r.json()["vectors"]
26
+ return embs
27
+ except Exception as ve:
28
+ print(ve)
29
+ return []
30
+
31
+
32
+ def embedded_bge(text: list):
33
+ try:
34
+ r = requests.post(URL_EMBBED_BGE, data = json.dumps({
35
+ "text": text
36
+ }))
37
+ embs = r.json()["embeddings"]
38
+ return embs
39
+ except Exception as ve:
40
+ print(ve)
41
+ return []
function/summary_with_llm.py ADDED
@@ -0,0 +1,210 @@
1
+ import json
2
+ import requests
3
+ import nltk
4
+ import re
5
+ import time
6
+
7
+ nltk.download('punkt')
8
+
9
+
10
+ URL_LLAMA = "http://10.9.3.241:8022/api/v1/llama/QnA"
11
+ VERSION = {
12
+ "en-llama": "mistral-full", #"llama2-7b",
13
+ "vi-llama": "mistral-full" #"ic-llama-68k"
14
+ }
15
+
16
+ SYSTEM_PROMPT = ""
17
+ POST_FIX_Q = ""
18
+
19
+ SYSTEM_PROMPT_CHECK_RELEVANT = "Bạn là trợ lý AI giúp mọi người tìm kiếm thông tin. Người dùng sẽ cung cấp cho bạn một câu hỏi. Nhiệm vụ của bạn là trả lời trung thực nhất có thể."
20
+ PROMPT_RELEVANT = 'Câu trả lời dưới có liên quan đến câu hỏi "{0}" hay không?'
21
+
22
+
23
+ REMOVE_WORDS = []
24
+ with open("data/remove.txt", 'r') as f:
25
+ for line in f:
26
+ line = line.strip()
27
+ if line:
28
+ REMOVE_WORDS.append(line.lower())
29
+ print(REMOVE_WORDS)
30
+
31
+
32
+ def check_answer_contains_remove_words(text):
33
+ text = text.lower()
34
+ text = re.sub(r'\s+', ' ', text)
35
+ lst_sen = nltk.sent_tokenize(text)
36
+ for sen in lst_sen:
37
+ for pat in REMOVE_WORDS:
38
+ if re.search(pat, sen) is not None:
39
+ return True
40
+ return False
41
+
42
+
43
+ def normalize_text_after_qna_llm(text, prompt: str = ""):
44
+
45
+ text = re.sub("^Dựa trên.{1,60}cung cấp,", "", text.strip(), flags=re.I).strip()
46
+ text = re.sub("^Dự báo.{1,60}là", "", text, flags=re.I).strip()
47
+ text = re.sub("^.{1,15}dựa trên.{1,60}cung cấp,", "", text, flags=re.I).strip()
48
+ text = re.sub("^.{1,15}theo thông tin.{1,60}cung cấp,", "", text, flags=re.I).strip()
49
+ # if len(prompt) > 10:
50
+ # print("pattern: ", "^" + prompt[:10] + ".{1,40}là")
51
+ # text = re.sub("^" + prompt[:10] + ".{1,40}là", "", text, flags=re.I).strip()
52
+ if text and text[0].islower():
53
+ text = text[0].upper() + text[1:]
54
+ return text
55
+
56
+
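+ # Split a long text into chunks of roughly max_word words, cutting only at sentence boundaries.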
57
+ def created_context(text, max_word=2048):
58
+ text = re.sub(r'\s+', ' ', text)
59
+ lst_sen = nltk.sent_tokenize(text)
60
+ count_w = 0
61
+ contexts = []
62
+ lst_sen_per_context = []
63
+ for sen in lst_sen:
64
+ count_w += sen.count(" ")
65
+ lst_sen_per_context.append(sen)
66
+ if count_w > max_word:
67
+ contexts.append(" ".join(lst_sen_per_context))
68
+ lst_sen_per_context = []
69
+ count_w = 0
70
+ if lst_sen_per_context:
71
+ contexts.append(" ".join(lst_sen_per_context))
72
+ return contexts
73
+
74
+
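+ # Send questions and contexts to the LLM QnA service (URL_LLAMA), log the request/response,
+ # and return the answers with "_" replaced by spaces; returns None on failure.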
75
+ def get_icgpt_result(questions, contexts, lang, version="vi-llama", system_prompt:str = "", id_cluster="", hash_str: str = ""):
76
+ json_body = {
77
+ "questions": questions,
78
+ "contexts": contexts,
79
+ "lang": lang,
80
+ "version": VERSION[version],
81
+ "system_prompt": system_prompt
82
+ }
83
+ try:
84
+
85
+ res = requests.post(URL_LLAMA, json=json_body)
86
+ with open(f"log_llm/requests_llama/{hash_str}_{id_cluster}.txt", "w") as f:
87
+ f.write(json.dumps(json_body) + "\n")
88
+ f.write(json.dumps(res.json()) + "\n" + "$"*50)
89
+ res = res.json()["result"]
90
+ res = [x.replace("_", " ") for x in res]
91
+ return res
92
+ except Exception as ex:
93
+ print(f"[ERROR] get_icgpt_official_visit: {str(ex)}")
94
+ with open(f"log_llm/requests_llama_error/{hash_str}_{id_cluster}.txt", "w") as f:
95
+ f.write(json.dumps(json_body) + "\n")
96
+ return None
97
+
98
+
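+ # Query the LLM with each question (questions separated by "#####", optional system prompt
+ # before "$$$$") and flag the content according to whether the answers hit the REMOVE_WORDS patterns.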
99
+ def check_relevant_with_llama(question, content, lang, version="en-llama", max_word_per_context=1024, id_cluster="", hash_str: str = ""):
100
+ contexts = created_context(content, max_word=max_word_per_context)
101
+ content = contexts[0] if contexts else ""
102
+ answer = ""
103
+ question_all = question.split("#####")
104
+ check_rel = True
105
+ for question in question_all:
106
+ question = question.strip()
107
+ # question = PROMPT_RELEVANT.format(question)
108
+ question_split = question.split("$$$$")
109
+ if len(question_split) < 2:
110
+ question = question
111
+ system_prompt = ""
112
+ else:
113
+ system_prompt = question_split[0].strip()
114
+ question = question_split[1].strip()
115
+
116
+ # system_prompt = SYSTEM_PROMPT_CHECK_RELEVANT
117
+ if content:
118
+ contexts = [content]
119
+ questions = [question]
120
+ lst_res = get_icgpt_result(questions, contexts, lang, version=version, system_prompt=system_prompt, id_cluster=f"{id_cluster}_recheck_relevant", hash_str=hash_str)
121
+ if lst_res is None:
122
+ lst_res = []
123
+ check_rel = True
124
+ # continue
125
+ # break
126
+ for an in lst_res:
127
+ if an:
128
+ is_relevant = check_answer_contains_remove_words(an)
129
+ answer = normalize_text_after_qna_llm(an, prompt=question)
130
+ if len(answer) == 0:
131
+ check_rel = True
132
+ # continue
133
+ # break
134
+ else:
135
+ check_rel = is_relevant
136
+ if not check_rel:
137
+ break
138
+ if not check_rel:
139
+ break
140
+ else:
141
+ check_rel = True
142
+ return check_rel
143
+
144
+
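+ # Build context chunks, query the LLM over the first chunk, drop answers that match REMOVE_WORDS,
+ # and return the normalized answer together with the remove-word flag.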
145
+ def summary_with_llama(question, content, lang, version="vi-llama", max_word_per_context=1024, id_cluster="", hash_str: str = ""):
146
+ contexts = created_context(content, max_word=max_word_per_context)
147
+ question_split = question.split("$$$$")
148
+ if len(question_split) < 2:
149
+ question = question
150
+ system_prompt = ""
151
+ else:
152
+ system_prompt = question_split[0].strip()
153
+ question = question_split[1].strip()
154
+
155
+ if question.strip().endswith(".") or question.strip().endswith("?"):
156
+ question = (question + POST_FIX_Q).strip()
157
+ else:
158
+ question = (question + ". " + POST_FIX_Q).strip()
159
+ answer = ""
160
+ if contexts:
161
+ contexts = [contexts[0]]
162
+ questions = [question] * len(contexts)
163
+ lst_res = get_icgpt_result(questions, contexts, lang, version=version, system_prompt=system_prompt, id_cluster=id_cluster, hash_str=hash_str)
164
+ results = []
165
+ if lst_res is None:
166
+ lst_res = []
167
+ for an in lst_res:
168
+
169
+ is_relevant = check_answer_contains_remove_words(an)
170
+
171
+ if not is_relevant:
172
+ results.append(an)
173
+ # else:
174
+ # print("ans_from_llama-preview: ", an, is_relevant)
175
+ if results:
176
+ if len(results) > 1:
177
+ context_combine = [". ".join(results)]
178
+ res = get_icgpt_result([question], context_combine, lang, version=version, system_prompt=system_prompt, id_cluster=id_cluster, hash_str=hash_str)
179
+ if res:
180
+ answer = res[0]
181
+ else:
182
+ answer = results[0]
183
+ else:
184
+ if lst_res:
185
+ return "", True
186
+ if answer:
187
+ is_relevant = check_answer_contains_remove_words(answer)
188
+ # print("ans_from_llama-before: ", answer, is_relevant)
189
+ answer = normalize_text_after_qna_llm(answer, prompt=question)
190
+
191
+ if len(answer) == 0:
192
+ is_relevant = True
193
+ # print("ans_from_llama-after: ", answer, is_relevant)
194
+ return answer, is_relevant
195
+
196
+
197
+ if __name__ == '__main__':
198
+ text = """ Dựa trên thông tin được cung cấp, xung đột ở Biển Đông đã gia tăng và có thể xuất hiện điểm xung đột mới giữa Philippines và Trung Quốc. Philippines đang xem xét xây dựng một ngọn hải đăng trên Bãi cạn Second Thomas (còn gọi là Bãi cạn Second Thomas), điều này đã khiến Trung Quốc tuyên bố rằng đây là một hành động vi phạm nghiêm trọng chủ quyền của họ và sẽ đáp trả kiên quyết. Giới chuyên gia cho rằng điểm bùng phát xung đột mới có thể xảy ra giữa Philippines và Trung nước ở Biển Đông. Tham vọng hung hăng của Trung Quốc đã thúc đẩy căng thẳng khu vực và các tranh chấp gần đây về vụ va chạm gần đây của các tàu gần Bãi Cỏ Mây (Second Thomas Shoal) ở Biển Đông đã làm căng th ng giữa Trung Quốc và Philippines gia tăng. """
199
+ is_re = check_answer_contains_remove_words(text)
200
+ print(is_re)
201
+ an = normalize_text_after_qna_llm(text)
202
+ print(an)
203
+ # exit(1)
204
+ # message = "G\u1EA7n \u0111\u00E2y, Philippines \u0111\u00E3 nhi\u1EC1u l\u1EA7n g\u00E2y ra r\u1EAFc r\u1ED1i \u1EDF Bi\u1EC3n \u0110\u00F4ng, x\u00E2m nh\u1EADp \u0110\u00E1 Nh\u00E2n \u00C1i thu\u1ED9c qu\u1EA7n \u0111\u1EA3o Nam Sa c\u1EE7a Trung Qu\u1ED1c, \u0111\u1ED3ng th\u1EDDi ti\u1EBFp t\u1EE5c ph\u00F3ng \u0111\u1EA1i v\u00E0 c\u01B0\u1EDDng \u0111i\u1EC7u h\u00F3a, nh\u1EA7m l\u1EABn \u0111\u00FAng sai v\u00E0 \u0111\u01B0a ra nh\u1EEFng c\u00E1o bu\u1ED9c v\u00F4 l\u00FD ch\u1ED1ng l\u1EA1i Trung Qu\u1ED1c. \\n Ng\u01B0\u1EDDi Philippines ngh\u0129 g\u00EC v\u1EC1 h\u00E0nh \u0111\u1ED9ng c\u1EE7a ch\u00EDnh ph\u1EE7 Philippines? \\ n \u00D4ng Tan \u0111\u00E3 t\u00ECm ra Herman Laurel, m\u1ED9t nh\u00E0 b\u00ECnh lu\u1EADn ch\u00EDnh tr\u1ECB n\u1ED5i ti\u1EBFng \u1EDF Philippines, \u0111\u1ED3ng th\u1EDDi l\u00E0 gi\u00E1m \u0111\u1ED1c Vi\u1EC7n nghi\u00EAn c\u1EE9u chi\u1EBFn l\u01B0\u1EE3c \"Th\u1EBF k\u1EF7 ch\u00E2u \u00C1\" c\u1EE7a Philippines v\u00E0 l\u00E0 ng\u01B0\u1EDDi s\u00E1ng l\u1EADp Hi\u1EC7p h\u1ED9i nghi\u00EAn c\u1EE9u ch\u00EDnh s\u00E1ch BRICS c\u1EE7a Philippines. Herman Laurel, ng\u01B0\u1EDDi \u0111ang nghi\u00EAn c\u1EE9u v\u1EC1 ch\u00E2u \u00C1, \u0111\u1EB7c bi\u1EC7t l\u00E0 m\u1ED1i quan h\u1EC7 gi\u1EEFa Trung Qu\u1ED1c v\u00E0 ASEAN, chia s\u1EBB quan s\u00E1t c\u1EE7a m\u00ECnh. \\n 01 \\n Tan Zhu: K\u1EC3 t\u1EEB n\u1EEDa cu\u1ED1i n\u0103m nay, Philippines th\u01B0\u1EDDng xuy\u00EAn x\u00E2m chi\u1EBFm c\u00E1c \u0111\u1EA3o v\u00E0 b\u00E3i \u0111\u00E1 c\u1EE7a ch\u00FAng t\u00F4i \u1EDF Bi\u1EC3n \u0110\u00F4ng, l\u00E0m gia t\u0103ng c\u0103ng th\u1EB3ng \u1EDF Bi\u1EC3n \u0110\u00F4ng, \u0111\u1ED3ng th\u1EDDi vu kh\u1ED1ng Trung Qu\u1ED1c l\u00E0 k\u1EBB b\u1EAFt n\u1EA1t l\u1EDBn. , c\u00F3 nh\u1EEFng nh\u00F3m \u1EDF Philippines ph\u1EA3n \u0111\u1ED1i h\u00E0nh \u0111\u1ED9ng khi\u00EAu kh\u00EDch c\u1EE7a Philippines. B\u1EA1n c\u1EA3m th\u1EA5y th\u1EBF n\u00E0o v\u1EC1 \u0111i\u1EC1u n\u00E0y v\u1EDBi t\u01B0 c\u00E1ch l\u00E0 m\u1ED9t ng\u01B0\u1EDDi Philippines? Herman Laurel: T\u00F4i cho r\u1EB1ng C\u1EA3nh s\u00E1t bi\u1EC3n Philippines v\u00E0 m\u1ED9t s\u1ED1 t\u00E0u d\u00E2n s\u1EF1 Philippines \u0111\u00E3 c\u1ED1 t\u00ECnh g\u00E2y r\u1EAFc r\u1ED1i v\u00EC h\u1ECD \u0111ang th\u1EF1c hi\u1EC7n \u201CD\u1EF1 \u00E1n t\u1ED5ng th\u1EC3\u201D c\u1EE7a Hoa K\u1EF3. \\nM\u1EF9 khoe khoang d\u1EF1 \u00E1n n\u00E0y v\u1EDBi ch\u00FAng t\u00F4i n\u00EAn ch\u00FAng t\u00F4i bi\u1EBFt t\u00EAn d\u1EF1 \u00E1n. D\u1EF1 \u00E1n \u0111\u01B0\u1EE3c d\u1EABn d\u1EAFt b\u1EDFi \u0110\u1EA1i t\u00E1 Kh\u00F4ng qu\u00E2n Hoa K\u1EF3 \u0111\u00E3 ngh\u1EC9 h\u01B0u Raimundo Powell, l\u00E0m vi\u1EC7c v\u1EDBi c\u00E1c \u0111\u1ED1i t\u00E1c Philippines nh\u01B0 C\u1EA3nh s\u00E1t bi\u1EC3n Philippines v\u00E0 m\u1ED9t s\u1ED1 quan ch\u1EE9c c\u1EE7a B\u1ED9 Ngo\u1EA1i giao Philippines. H\u1ECD l\u00EAn k\u1EBF ho\u1EA1ch, \u00E2m m\u01B0u v\u00E0 th\u1EF1c hi\u1EC7n d\u1EF1 \u00E1n n\u00E0y nh\u1EB1m \u0111\u1ED1i \u0111\u1EA7u, khi\u00EAu kh\u00EDch v\u00E0 \u0111\u1ED1i \u0111\u1EA7u v\u1EDBi Qu\u00E2n \u0111o\u00E0n C\u1EA3nh s\u00E1t bi\u1EC3n thu\u1ED9c L\u1EF1c l\u01B0\u1EE3ng C\u1EA3nh s\u00E1t V\u0169 trang Nh\u00E2n d\u00E2n Trung Qu\u1ED1c. 
T\u1EA5t c\u1EA3 nh\u1EEFng \u0111i\u1EC1u n\u00E0y \u0111\u1EC1u nh\u1EB1m m\u1EE5c \u0111\u00EDch c\u1ED1 t\u00ECnh t\u1EA1o ra c\u0103ng th\u1EB3ng \u1EDF Bi\u1EC3n \u0110\u00F4ng, \u0111\u1EB7c bi\u1EC7t l\u00E0 t\u1EA1i c\u00E1c khu v\u1EF1c tranh ch\u1EA5p gi\u1EEFa Philippines v\u00E0 Trung Qu\u1ED1c nh\u01B0 B\u00E3i c\u1EA1n Scarborough hay B\u00E3i c\u1EA1n Second Thomas. \\n \\n \\n \\n \\n Vi\u1EC7c Trung Qu\u1ED1c s\u1EED d\u1EE5ng t\u00E0u l\u1EDBn v\u00E0 t\u00E0u chi\u1EBFn l\u00E0 \u0111i\u1EC1u t\u1ED1t. L\u1EE3i th\u1EBF v\u1EC1 s\u1ED1 l\u01B0\u1EE3ng v\u00E0 quy m\u00F4 c\u1EE7a L\u1EF1c l\u01B0\u1EE3ng C\u1EA3nh s\u00E1t bi\u1EC3n thu\u1ED9c L\u1EF1c l\u01B0\u1EE3ng C\u1EA3nh s\u00E1t V\u0169 trang Nh\u00E2n d\u00E2n Trung Qu\u1ED1c th\u1EF1c s\u1EF1 tr\u00E1nh \u0111\u01B0\u1EE3c m\u1ED9t cu\u1ED9c \u0111\u1ED1i \u0111\u1EA7u th\u1EF1c s\u1EF1 nghi\u00EAm tr\u1ECDng h\u01A1n, b\u1EDFi v\u00EC n\u1EBFu hai b\u00EAn c\u00F3 quy m\u00F4 ngang nhau, M\u1ED9t cu\u1ED9c xung \u0111\u1ED9t h\u1EA1m \u0111\u1ED9i c\u00F3 th\u1EC3 tr\u1EDF n\u00EAn s\u1EAFp x\u1EA3y ra. \\ n Nh\u01B0ng d\u00F9 th\u1EBF n\u00E0o \u0111i n\u1EEFa, c\u00E1c ph\u01B0\u01A1ng ti\u1EC7n truy\u1EC1n th\u00F4ng ph\u01B0\u01A1ng T\u00E2y s\u1EBD m\u00F4 t\u1EA3 Trung Qu\u1ED1c l\u00E0 k\u1EBB b\u1EAFt n\u1EA1t. \u0110\u00E2y l\u00E0 m\u1EE5c \u0111\u00EDch th\u1EF1c s\u1EF1 c\u1EE7a d\u1EF1 \u00E1n n\u00E0y v\u00E0 l\u00E0 m\u1EE5c ti\u00EAu m\u00E0 Hoa K\u1EF3 \u0111\u1EB7t ra - h\u1ECD s\u1EED d\u1EE5ng c\u00E1i g\u1ECDi l\u00E0 \"ni\u1EC1m tin v\u00E0o t\u00EDnh minh b\u1EA1ch c\u1EE7a b\u00E1o c\u00E1o\" \u0111\u1EC3 c\u1EA5u th\u00E0nh m\u1ED9t \"\u00E2m m\u01B0u \u0111en t\u1ED1i\" ch\u1ED1ng l\u1EA1i Trung Qu\u1ED1c... c\u00F4ng khai\". \u0110\u01B0\u1EE3c cho l\u00E0 minh b\u1EA1ch v\u00EC truy\u1EC1n th\u00F4ng ph\u01B0\u01A1ng T\u00E2y \u0111\u00E3 mang theo camera, nh\u00E2n vi\u00EAn truy\u1EC1n th\u00F4ng... \u0111\u1EC3 ghi l\u1EA1i qu\u00E1 tr\u00ECnh n\u00E0y v\u00E0 \u0111\u01B0a tin cho kh\u00E1n gi\u1EA3 Philippines v\u00E0 th\u1EBF gi\u1EDBi. \\nHoa K\u1EF3 \u0111\u00E3 ch\u1EC9 ra trong \u0111\u1EC1 xu\u1EA5t d\u1EF1 \u00E1n r\u1EB1ng m\u1EE5c ti\u00EAu c\u1EE7a h\u1ECD l\u00E0 khi\u1EBFn Trung Qu\u1ED1c ph\u1EA3i tr\u1EA3 gi\u00E1. T\u00F4i ngh\u0129 m\u1ECDi ng\u01B0\u1EDDi \u0111\u1EC1u c\u00F3 th\u1EC3 hi\u1EC3u \u0111i\u1EC1u n\u00E0y c\u00F3 ngh\u0129a l\u00E0 g\u00EC - Hoa K\u1EF3 mu\u1ED1n h\u1EE7y ho\u1EA1i danh ti\u1EBFng c\u1EE7a Trung Qu\u1ED1c. \\n C\u00F3 m\u1ED9t th\u1EDDi, Philippines c\u00F3 m\u1ED1i quan h\u1EC7 r\u1EA5t h\u1EEFu \u00EDch v\u00E0 th\u00E2n thi\u1EC7n v\u1EDBi Trung Qu\u1ED1c. C\u00F3 nhi\u1EC1u c\u1EA7u n\u1ED1i h\u1EE3p t\u00E1c gi\u1EEFa ch\u00FAng ta, ch\u1EB3ng h\u1EA1n nh\u01B0 c\u00E1c d\u1EF1 \u00E1n h\u1EE3p t\u00E1c trong vi\u1EC7c c\u00F9ng x\u00E2y d\u1EF1ng s\u00E1ng ki\u1EBFn \u200B\u200B\u201CV\u00E0nh \u0111ai v\u00E0 Con \u0111\u01B0\u1EDDng\u201D, gi\u00FAp t\u0103ng th\u00EAm l\u1EE3i th\u1EBF kinh t\u1EBF c\u1EE7a Philippines. \\n Tuy nhi\u00EAn, T\u1ED5ng th\u1ED1ng Philippines Marcos r\u00F5 r\u00E0ng c\u00F3 quan \u0111i\u1EC3m ri\u00EAng v\u1EC1 s\u1EF1 d\u00E0n x\u1EBFp c\u1EE7a M\u1EF9, \u00F4ng ch\u01B0a bao gi\u1EDD gi\u1EA3i th\u00EDch cho ng\u01B0\u1EDDi d\u00E2n Philippines l\u00FD do quay tr\u1EDF l\u1EA1i v\u1EDBi M\u1EF9. V\u1EC1 v\u1EA5n \u0111\u1EC1 Bi\u1EC3n \u0110\u00F4ng, ch\u00EDnh s\u00E1ch c\u1EE7a Marcos c\u00E0ng th\u00F9 \u0111\u1ECBch v\u00E0 hung h\u0103ng h\u01A1n. 
\\n Tuy nhi\u00EAn, ng\u01B0\u1EDDi d\u00E2n Philippines kh\u00F4ng quan t\u00E2m \u0111\u1EBFn tranh ch\u1EA5p Bi\u1EC3n \u0110\u00F4ng. \\n \"Asia Pulse\" c\u00F3 th\u1EC3 n\u00F3i l\u00E0 m\u1ED9t t\u1ED5 ch\u1EE9c b\u1ECF phi\u1EBFu, m\u1EE5c \u0111\u00EDch l\u00E0 t\u00ECm hi\u1EC3u mong mu\u1ED1n c\u1EE7a ng\u01B0\u1EDDi d\u00E2n Ch\u00E2u \u00C1. V\u00E0o th\u00E1ng 9 n\u0103m 2023, m\u1ED9t b\u00E1o c\u00E1o th\u0103m d\u00F2 \u00FD ki\u1EBFn \u200B\u200Bcho th\u1EA5y ch\u1EC9 c\u00F3 7% ng\u01B0\u1EDDi d\u00E2n Philippines quan t\u00E2m \u0111\u1EBFn tranh ch\u1EA5p Bi\u1EC3n \u0110\u00F4ng, trong khi 93% ng\u01B0\u1EDDi d\u00E2n th\u1EF1c s\u1EF1 ch\u1EC9 quan t\u00E2m \u0111\u1EBFn gi\u00E1 c\u1EA3 cao, l\u1EA1m ph\u00E1t, vi\u1EC7c l\u00E0m, v\u1EA5n \u0111\u1EC1 t\u1ED9i ph\u1EA1m gia t\u0103ng v\u00E0 v\u1EA5n \u0111\u1EC1 ma t\u00FAy, v.v. . . \\nGi\u1EDD \u0111\u00E2y, ngay c\u1EA3 gia \u0111\u00ECnh Marcos c\u0169ng b\u1ECB chia r\u1EBD. Em g\u00E1i c\u1EE7a Marcos, v\u1EDBi t\u01B0 c\u00E1ch l\u00E0 ch\u1EE7 t\u1ECBch \u1EE6y ban \u0110\u1ED1i ngo\u1EA1i Th\u01B0\u1EE3ng vi\u1EC7n Philippines, \u0111\u00E3 ph\u1EA3n \u0111\u1ED1i vi\u1EC7c Marcos quay sang Hoa K\u1EF3 v\u00E0 h\u00E0nh vi hung h\u0103ng c\u1EE7a Marcos \u1EDF ch\u00E2u \u00C1, \u0111\u1ED3ng th\u1EDDi ch\u1EC9 tr\u00EDch vi\u1EC7c tri\u1EC3n khai c\u00E1c c\u0103n c\u1EE9 c\u1EE7a M\u1EF9 \u1EDF Philippines v\u00E0 nh\u1EEFng c\u0103ng th\u1EB3ng do Hoa K\u1EF3 g\u00E2y ra \u1EDF Philippines. Bi\u1EC3n \u0110\u00F4ng. . \\n 02 \\n Tan Zhu: Sau khi th\u1ED5i ph\u1ED3ng s\u1EF1 c\u1ED1 \u0110\u00E1 Nh\u00E2n \u00C1i, kh\u00F4ng qu\u1ED1c gia n\u00E0o \u1EE7ng h\u1ED9 Philippines l\u00E0 qu\u1ED1c gia ASEAN. Ph\u1EA3i ch\u0103ng \u0111i\u1EC1u n\u00E0y c\u00F3 ngh\u0129a l\u00E0 Philippines \u0111\u00E3 b\u1ECB c\u00F4 l\u1EADp \u1EDF c\u1EA5p \u0111\u1ED9 ngo\u1EA1i giao khu v\u1EF1c do \u0111i \u0111\u1EA7u trong vi\u1EC7c g\u00E2y r\u1EAFc r\u1ED1i ? C\u00E1c n\u01B0\u1EDBc trong khu v\u1EF1c Bi\u1EC3n \u0110\u00F4ng mu\u1ED1n th\u1EA5y lo\u1EA1i Bi\u1EC3n \u0110\u00F4ng nh\u01B0 th\u1EBF n\u00E0o? \\n Herman Laurel: T\u00F4i ngh\u0129 ASEAN r\u00F5 r\u00E0ng kh\u00F4ng mu\u1ED1n c\u0103ng th\u1EB3ng leo thang. \\nTrong cu\u1ED9c ph\u1ECFng v\u1EA5n v\u1EDBi m\u1ED9t t\u1EDD b\u00E1o \u0111\u1ECBa ph\u01B0\u01A1ng \u1EDF Philippines, t\u00F4i n\u00F3i r\u1EB1ng Kh\u00F4ng qu\u00E2n Hoa K\u1EF3 v\u00E0 c\u00E1c l\u1EF1c l\u01B0\u1EE3ng qu\u00E2n s\u1EF1 kh\u00E1c \u0111\u00E3 can thi\u1EC7p v\u00E0o c\u00F4ng vi\u1EC7c c\u1EE7a Philippines \u1EDF Bi\u1EC3n \u0110\u00F4ng, v\u00E0 ASEAN kh\u00F4ng th\u00EDch s\u1EF1 can thi\u1EC7p c\u1EE7a M\u1EF9 v\u00E0o Bi\u1EC3n \u0110\u00F4ng. \\nM\u1EB7c d\u00F9 c\u00E1c n\u01B0\u1EDBc ASEAN r\u1EA5t l\u1ECBch s\u1EF1 v\u1EDBi nhau v\u00E0 kh\u00F4ng tr\u1EF1c ti\u1EBFp n\u00EAu t\u00EAn Philippines nh\u01B0ng h\u1ECD kh\u00F4ng tham gia c\u00E1c h\u00E0nh \u0111\u1ED9ng \u0111\u1ED9c l\u1EADp c\u1EE7a Philippines m\u00E0 c\u00F4 l\u1EADp Philippines. \\nT\u00F4i ngh\u0129 Th\u1EE7 t\u01B0\u1EDBng Singapore L\u00FD Hi\u1EC3n Long v\u00E0 c\u00E1c nh\u00E0 l\u00E3nh \u0111\u1EA1o kh\u00E1c \u0111\u00E3 \u0111\u01B0a ra m\u1ED9t s\u1ED1 b\u00ECnh lu\u1EADn gi\u00E1n ti\u1EBFp \u0111\u1EC1 c\u1EADp \u0111\u1EBFn Philippines v\u00E0 c\u1EA3nh b\u00E1o n\u01B0\u1EDBc n\u00E0y kh\u00F4ng n\u00EAn tr\u1EDF th\u00E0nh chi\u1EBFn tr\u01B0\u1EDDng. 
\u0110i\u1EC1u n\u00E0y th\u1EC3 hi\u1EC7n r\u1EA5t r\u00F5 m\u1ED1i quan ng\u1EA1i c\u1EE7a Singapore v\u00E0 truy\u1EC1n t\u1EA3i m\u1ED1i quan ng\u1EA1i t\u01B0\u01A1ng t\u1EF1 c\u1EE7a c\u00E1c n\u01B0\u1EDBc ASEAN. \\nT\u00F4i c\u0169ng \u0111ang li\u00EAn h\u1EC7 v\u1EDBi m\u1ED9t s\u1ED1 \u0111\u1EA1i s\u1EE9 qu\u00E1n ASEAN t\u1EA1i Manila. Trong cu\u1ED9c tr\u00F2 chuy\u1EC7n v\u1EDBi h\u1ECD, t\u00F4i \u0111\u00E3 nghe h\u1ECD b\u00E0y t\u1ECF m\u1ED1i quan ng\u1EA1i c\u1EE7a m\u1ED7i n\u01B0\u1EDBc v\u1EC1 c\u00E1c h\u00E0nh \u0111\u1ED9ng hi\u1EC7n t\u1EA1i c\u1EE7a Philippines \u1EDF Bi\u1EC3n \u0110\u00F4ng. \\nTr\u00EAn th\u1EF1c t\u1EBF, t\u00ECnh h\u00ECnh do Marcos v\u00E0 Hoa K\u1EF3 t\u1EA1o ra c\u0169ng \u0111\u00E3 khi\u1EBFn ng\u01B0\u1EDDi d\u00E2n c\u1EE7a ch\u00FAng t\u00F4i lo l\u1EAFng v\u1EC1 an ninh qu\u1ED1c gia. \\n Marcos v\u00E0 c\u00E1c th\u00E0nh vi\u00EAn n\u1ED9i c\u00E1c an ninh c\u1EE7a \u00F4ng \u0111\u00E3 th\u00F4ng b\u00E1o r\u1EB1ng h\u1ECD \u0111ang chuy\u1EC3n s\u1EF1 ch\u00FA \u00FD c\u1EE7a \u0111\u1EA5t n\u01B0\u1EDBc t\u1EEB an ninh n\u1ED9i b\u1ED9 sang an ninh b\u00EAn ngo\u00E0i, t\u1EE9c l\u00E0 h\u1ECD \u0111ang l\u00E0m \u0111i\u1EC1u \u0111\u00F3 m\u00E0 kh\u00F4ng th\u1EF1c s\u1EF1 t\u00ECm ra ai l\u00E0 m\u1ED1i \u0111e d\u1ECDa an ninh qu\u1ED1c gia, ho\u1EA1t \u0111\u1ED9ng \u0111\u1EC3 b\u1EA3o v\u1EC7 an ninh l\u00E3nh th\u1ED5. \\n S\u1EF1 thay \u0111\u1ED5i ch\u00EDnh s\u00E1ch n\u00E0y khi\u1EBFn m\u1ECDi ng\u01B0\u1EDDi l\u1EA7m t\u01B0\u1EDFng r\u1EB1ng ph\u1EA3i ch\u00FA \u00FD \u0111\u1EBFn an ninh b\u00EAn ngo\u00E0i, tuy nhi\u00EAn \u0111\u00E2y kh\u00F4ng ph\u1EA3i l\u00E0 m\u1ED1i \u0111e d\u1ECDa th\u1EF1c s\u1EF1 m\u00E0 ch\u1EC9 l\u00E0 m\u1ED9t \u1EA3o \u1EA3nh, khi\u1EBFn m\u1ECDi ng\u01B0\u1EDDi l\u01A1 l\u00E0 an ninh trong n\u01B0\u1EDBc v\u00E0 l\u00E0m t\u00ECnh h\u00ECnh an ninh trong n\u01B0\u1EDBc tr\u1EDF n\u00EAn t\u1ED3i t\u1EC7 h\u01A1n. \\n C\u00E1ch \u0111\u00E2y kh\u00F4ng l\u00E2u, m\u1ED9t v\u1EE5 \u0111\u00E1nh bom kh\u1EE7ng b\u1ED1 nghi\u00EAm tr\u1ECDng \u0111\u00E3 x\u1EA3y ra t\u1EA1i \u0110\u1EA1i h\u1ECDc Qu\u1ED1c gia Mindanao \u1EDF th\u00E0nh ph\u1ED1 Marawi, khi\u1EBFn nhi\u1EC1u ng\u01B0\u1EDDi Philippines thi\u1EC7t m\u1EA1ng; \u1EDF m\u1ED9t t\u1EC9nh kh\u00E1c, m\u1ED9t v\u1EE5 n\u1ED5 c\u0169ng x\u1EA3y ra tr\u00EAn m\u1ED9t chi\u1EBFc xe bu\u00FDt. \\n \\n \\n \\n \\n V\u00EC v\u1EADy, ch\u00EDnh ph\u1EE7 n\u00E0y th\u1EF1c s\u1EF1 \u0111ang g\u1EB7p r\u1EAFc r\u1ED1i li\u00EAn t\u1EE5c. Khi Marcos v\u1EA5p ph\u1EA3i s\u1EF1 ph\u1EA3n \u0111\u1ED1i \u0111\u1ED1i v\u1EDBi c\u00E1c ch\u00EDnh s\u00E1ch sai l\u1EA7m c\u1EE7a m\u00ECnh, s\u1EF1 \u1EE7ng h\u1ED9 c\u1EE7a c\u00F4ng ch\u00FAng \u0111\u1ED1i v\u1EDBi \u00F4ng \u0111\u00E3 gi\u1EA3m \u00EDt nh\u1EA5t 15%. \\n Nh\u1EEFng t\u00ECnh h\u00ECnh trong n\u01B0\u1EDBc n\u00E0y cho th\u1EA5y n\u0103m 2024 s\u1EBD l\u00E0 m\u1ED9t n\u0103m r\u1EA5t kh\u00F3 kh\u0103n \u0111\u1ED1i v\u1EDBi Marcos. \\nM\u1EF9 \u0111\u00E3 g\u00E2y \u00E1p l\u1EF1c l\u1EDBn, y\u00EAu c\u1EA7u Philippines ti\u1EBFp t\u1EE5c ki\u1EC1m ch\u1EBF Trung Qu\u1ED1c. 
Nh\u01B0ng t\u00F4i ngh\u0129 \u0111\u1EA1i \u0111a s\u1ED1 ng\u01B0\u1EDDi Philippines c\u0169ng \u0111ang l\u00E0m ph\u1EA7n vi\u1EC7c c\u1EE7a m\u00ECnh \u0111\u1EC3 c\u1ED1 g\u1EAFng t\u00E1i kh\u1EB3ng \u0111\u1ECBnh quan \u0111i\u1EC3m v\u00E0 quan \u0111i\u1EC3m th\u1EF1c s\u1EF1 c\u1EE7a ng\u01B0\u1EDDi Philippines v\u1EC1 Trung Qu\u1ED1c, m\u1ED9t n\u01B0\u1EDBc th\u00E2n thi\u1EC7n, \u1EA5m \u00E1p v\u00E0 hi\u1EC7u qu\u1EA3. \\n 03 \\n Zhu Tan: \u00D4ng t\u1EEBng ch\u1EC9 ra r\u1EB1ng qu\u00E1 tr\u00ECnh chuy\u1EC3n \u0111\u1ED5i kinh t\u1EBF v\u00E0 t\u0103ng tr\u01B0\u1EDFng b\u1EC1n v\u1EEFng c\u1EE7a Philippines kh\u00F4ng th\u1EC3 t\u00E1ch r\u1EDDi vi\u1EC7c duy tr\u00EC h\u1EE3p t\u00E1c ch\u1EB7t ch\u1EBD v\u1EDBi Trung Qu\u1ED1c v\u00E0 ASEAN. T\u1EA1i sao \u00F4ng l\u1EA1i \u0111\u01B0a ra nh\u1EADn \u0111\u1ECBnh nh\u01B0 v\u1EADy? \\n Herman Laurel: Sau khi Marcos nh\u1EADm ch\u1EE9c, ch\u00EDnh s\u00E1ch \u0111\u1ED1i ngo\u1EA1i \u0111\u1ED9c l\u1EADp v\u00E0 quan h\u1EC7 h\u1EEFu ngh\u1ECB c\u1EE7a ch\u00EDnh ph\u1EE7 Philippines v\u1EDBi Trung Qu\u1ED1c v\u00E0 c\u00E1c n\u01B0\u1EDBc l\u00E1ng gi\u1EC1ng ch\u00E2u \u00C1 kh\u00E1c \u0111\u00E3 quay \u0111\u1EA7u v\u00E0 quay sang Hoa K\u1EF3, \u0111i\u1EC1u n\u00E0y mang l\u1EA1i nhi\u1EC1u v\u1EA5n \u0111\u1EC1 cho Philippines, ti\u1EC1m n\u0103ng nghi\u00EAm tr\u1ECDng nh\u1EA5t t\u00E1c \u0111\u1ED9ng v\u1EABn c\u00F3 th\u1EC3 xu\u1EA5t hi\u1EC7n trong l\u0129nh v\u1EF1c kinh t\u1EBF. \\nR\u00F5 r\u00E0ng \u0111\u1ED9ng c\u01A1 ph\u1EE5c h\u1ED3i kinh t\u1EBF to\u00E0n c\u1EA7u sau cu\u1ED9c \u0110\u1EA1i suy tho\u00E1i k\u00E9o d\u00E0i hai n\u0103m r\u01B0\u1EE1i l\u00E0 Trung Qu\u1ED1c. \\ n Li\u00EAn minh Ch\u00E2u \u00C2u hi\u1EC7n \u0111ang trong v\u1EF1c th\u1EB3m suy tho\u00E1i. T\u1ED5ng s\u1EA3n ph\u1EA9m qu\u1ED1c n\u1ED9i (GDP) th\u1EF1c t\u1EBF c\u1EE7a Nh\u1EADt B\u1EA3n l\u1EA7n \u0111\u1EA7u ti\u00EAn t\u0103ng tr\u01B0\u1EDFng \u00E2m trong qu\u00FD 3 n\u0103m nay. Tuy nhi\u00EAn, Qu\u1EF9 Ti\u1EC1n t\u1EC7 Qu\u1ED1c t\u1EBF g\u1EA7n \u0111\u00E2y \u0111\u00E3 n\u00E2ng d\u1EF1 b\u00E1o t\u0103ng tr\u01B0\u1EDFng GDP c\u1EE7a Trung Qu\u1ED1c cho n\u0103m 2023 l\u00EAn 5,4% v\u00E0 n\u0103m 2024. T\u01B0\u01A1ng lai v\u1EABn t\u01B0\u01A1i s\u00E1ng. \\n B\u1EA1n ph\u1EA3i bi\u1EBFt r\u1EB1ng Trung Qu\u1ED1c kh\u00F4ng ch\u1EC9 l\u00E0 \u0111\u1ED1i t\u00E1c th\u01B0\u01A1ng m\u1EA1i l\u1EDBn nh\u1EA5t c\u1EE7a ASEAN m\u00E0 c\u00F2n l\u00E0 \u0111\u1ED1i t\u00E1c th\u01B0\u01A1ng m\u1EA1i l\u1EDBn nh\u1EA5t c\u1EE7a Philippines. \\n Tuy nhi\u00EAn, ch\u00EDnh quy\u1EC1n Marcos \u0111\u00E3 h\u1EE7y b\u1ECF nhi\u1EC1u d\u1EF1 \u00E1n x\u00E2y d\u1EF1ng c\u1EE7a Trung Qu\u1ED1c, m\u1EB7c d\u00F9 Trung Qu\u1ED1c kh\u00F4ng c\u00F3 b\u1EA5t k\u1EF3 t\u00E1c \u0111\u1ED9ng ti\u00EAu c\u1EF1c n\u00E0o v\u1EC1 kinh t\u1EBF \u0111\u1ED1i v\u1EDBi Philippines. \\n T\u00F4i \u0111\u00E3 nh\u1EAFc nh\u1EDF \u0111\u1ED3ng b\u00E0o m\u00ECnh r\u1EB1ng ch\u00FAng ta kh\u00F4ng \u0111\u01B0\u1EE3c qu\u00EAn b\u00E0i h\u1ECDc c\u1EE7a n\u0103m 2014. 
Khi \u0111\u00F3, d\u1EF1 \u0111o\u00E1n sai l\u1EA7m c\u1EE7a ch\u00EDnh ph\u1EE7 Aquino \u0111\u00E3 khi\u1EBFn s\u1ED1 chu\u1ED1i tr\u1ECB gi\u00E1 h\u00E0ng t\u1EF7 peso m\u00E0 n\u01B0\u1EDBc ta d\u1EF1 \u0111\u1ECBnh xu\u1EA5t sang Trung Qu\u1ED1c ban \u0111\u1EA7u kh\u00F4ng v\u00E0o \u0111\u01B0\u1EE3c Trung Qu\u1ED1c, c\u00E1c nh\u00E0 xu\u1EA5t kh\u1EA9u chu\u1ED1i Philippines ch\u1EC9 c\u00F3 th\u1EC3 \u0111\u1EE9ng nh\u00ECn chu\u1ED1i c\u1EE7a m\u00ECnh th\u1ED1i r\u1EEFa trong nh\u1EEFng l\u00F4 b\u1ECB t\u1EEB ch\u1ED1i trong h\u1ED3 s\u01A1. \\n V\u00EC v\u1EADy, c\u00F3 th\u1EC3 th\u1EA5y r\u1EB1ng m\u1ED9t d\u1EF1 \u0111o\u00E1n sai l\u1EA7m c\u1EE7a ch\u00EDnh ph\u1EE7 Philippines c\u00F3 th\u1EC3 d\u1EABn \u0111\u1EBFn t\u00ECnh th\u1EBF kh\u00F3 kh\u0103n m\u00E0 ng\u01B0\u1EDDi d\u00E2n Philippines b\u00ECnh th\u01B0\u1EDDng ng\u00E0y nay ph\u1EA3i \u0111\u1ED1i m\u1EB7t. Ch\u00FAng t\u00F4i li\u00EAn t\u1EE5c nh\u1EAFc nh\u1EDF ng\u01B0\u1EDDi d\u00E2n trong n\u01B0\u1EDBc r\u1EB1ng ch\u00FAng t\u00F4i kh\u00F4ng mu\u1ED1n nh\u1EEFng t\u00ECnh hu\u1ED1ng n\u00E0y x\u1EA3y ra l\u1EA7n n\u1EEFa. \\nC\u00F3 l\u1EBD ch\u1EC9 c\u00F3 Trung Qu\u1ED1c m\u1EDBi th\u1EF1c s\u1EF1 c\u00F3 th\u1EC3 th\u00FAc \u0111\u1EA9y s\u1EF1 ph\u00E1t tri\u1EC3n kinh t\u1EBF m\u00E0 Philippines c\u1EA7n trong nh\u1EEFng n\u0103m t\u1EDBi. L\u1EA5y n\u0103m t\u1EDBi l\u00E0m v\u00ED d\u1EE5, Hoa K\u1EF3 c\u00F3 th\u1EC3 kh\u00F4ng c\u00F3 \u0111\u1EE7 ngu\u1ED3n l\u1EF1c t\u00E0i ch\u00EDnh \u0111\u1EC3 h\u1ED7 tr\u1EE3 Philippines, Nh\u1EADt B\u1EA3n c\u00F3 th\u1EC3 kh\u00F4ng th\u1EC3 cung c\u1EA5p h\u1ED7 tr\u1EE3 do suy tho\u00E1i kinh t\u1EBF, Ng\u00E2n h\u00E0ng Ph\u00E1t tri\u1EC3n Ch\u00E2u \u00C1 v\u1EABn c\u00F2n m\u1ED9t s\u1ED1 d\u1EF1 \u00E1n c\u0169 c\u1EA7n \u0111\u01B0\u1EE3c ho\u00E0n thi\u1EC7n. ti\u1EBFp t\u1EE5c v\u00E0 c\u00F3 th\u1EC3 kh\u00F4ng c\u00F3 th\u1EDDi gian \u0111\u1EC3 quan t\u00E2m \u0111\u1EBFn Philippines. \\n Trong nh\u1EEFng th\u00E1ng t\u1EDBi, n\u1EBFu ch\u00FAng ta c\u00F3 th\u1EC3 ng\u0103n ch\u1EB7n th\u00E0nh c\u00F4ng vi\u1EC7c Hoa K\u1EF3 ph\u00E1 ho\u1EA1i m\u1ED1i quan h\u1EC7 t\u1ED1t \u0111\u1EB9p c\u1EE7a Philippines v\u1EDBi Trung Qu\u1ED1c, ch\u00FAng ta c\u00F3 th\u1EC3 ti\u1EBFp t\u1EE5c h\u01B0\u1EDBng t\u1EDBi m\u1ED9t t\u01B0\u01A1ng lai t\u1ED1t \u0111\u1EB9p h\u01A1n. \\n V\u00EC v\u1EADy, ch\u00FAng t\u00F4i hy v\u1ECDng r\u1EB1ng v\u00E0o n\u0103m 2024, Philippines c\u00F3 th\u1EC3 xoay chuy\u1EC3n t\u00ECnh th\u1EBF. \\n \\n \\n [Bi\u00EAn t\u1EADp vi\u00EAn: T\u1EC1 L\u00F4i]"
205
+ # message = "G\u1EA7n \u0111\u00E2y, Philippines \u0111\u00E3 nhi\u1EC1u l\u1EA7n g\u00E2y ra r\u1EAFc r\u1ED1i \u1EDF Bi\u1EC3n \u0110\u00F4ng, x\u00E2m nh\u1EADp \u0110\u00E1 Nh\u00E2n \u00C1i thu\u1ED9c qu\u1EA7n \u0111\u1EA3o Nam Sa c\u1EE7a Trung Qu\u1ED1c, \u0111\u1ED3ng th\u1EDDi ti\u1EBFp t\u1EE5c ph\u00F3ng \u0111\u1EA1i v\u00E0 c\u01B0\u1EDDng \u0111i\u1EC7u h\u00F3a, nh\u1EA7m l\u1EABn \u0111\u00FAng sai v\u00E0 \u0111\u01B0a ra nh\u1EEFng c\u00E1o bu\u1ED9c v\u00F4 l\u00FD ch\u1ED1ng l\u1EA1i Trung Qu\u1ED1c. \\n Ng\u01B0\u1EDDi Philippines ngh\u0129 g\u00EC v\u1EC1 h\u00E0nh \u0111\u1ED9ng c\u1EE7a ch\u00EDnh ph\u1EE7 Philippines? \\ n \u00D4ng Tan \u0111\u00E3 t\u00ECm ra Herman Laurel, m\u1ED9t nh\u00E0 b\u00ECnh lu\u1EADn ch\u00EDnh tr\u1ECB n\u1ED5i ti\u1EBFng \u1EDF Philippines, \u0111\u1ED3ng th\u1EDDi l\u00E0 gi\u00E1m \u0111\u1ED1c Vi\u1EC7n nghi\u00EAn c\u1EE9u chi\u1EBFn l\u01B0\u1EE3c \"Th\u1EBF k\u1EF7 ch\u00E2u \u00C1\" c\u1EE7a Philippines v\u00E0 l\u00E0 ng\u01B0\u1EDDi s\u00E1ng l\u1EADp Hi\u1EC7p h\u1ED9i nghi\u00EAn c\u1EE9u ch\u00EDnh s\u00E1ch BRICS c\u1EE7a Philippines. Herman Laurel, ng\u01B0\u1EDDi \u0111ang nghi\u00EAn c\u1EE9u v\u1EC1 ch\u00E2u \u00C1, \u0111\u1EB7c bi\u1EC7t l\u00E0 m\u1ED1i quan h\u1EC7 gi\u1EEFa Trung Qu\u1ED1c v\u00E0 ASEAN, chia s\u1EBB quan s\u00E1t c\u1EE7a m\u00ECnh. \\n 01 \\n Tan Zhu: K\u1EC3 t\u1EEB n\u1EEDa cu\u1ED1i n\u0103m nay, Philippines th\u01B0\u1EDDng xuy\u00EAn x\u00E2m chi\u1EBFm c\u00E1c \u0111\u1EA3o v\u00E0 b\u00E3i \u0111\u00E1 c\u1EE7a ch\u00FAng t\u00F4i \u1EDF Bi\u1EC3n \u0110\u00F4ng, l\u00E0m gia t\u0103ng c\u0103ng th\u1EB3ng \u1EDF Bi\u1EC3n \u0110\u00F4ng, \u0111\u1ED3ng th\u1EDDi vu kh\u1ED1ng Trung Qu\u1ED1c l\u00E0 k\u1EBB b\u1EAFt n\u1EA1t l\u1EDBn. , c\u00F3 nh\u1EEFng nh\u00F3m \u1EDF Philippines ph\u1EA3n \u0111\u1ED1i h\u00E0nh \u0111\u1ED9ng khi\u00EAu kh\u00EDch c\u1EE7a Philippines. B\u1EA1n c\u1EA3m th\u1EA5y th\u1EBF n\u00E0o v\u1EC1 \u0111i\u1EC1u n\u00E0y v\u1EDBi t\u01B0 c\u00E1ch l\u00E0 m\u1ED9t ng\u01B0\u1EDDi Philippines? Herman Laurel: T\u00F4i cho r\u1EB1ng C\u1EA3nh s\u00E1t bi\u1EC3n Philippines v\u00E0 m\u1ED9t s\u1ED1 t\u00E0u d\u00E2n s\u1EF1 Philippines \u0111\u00E3 c\u1ED1 t\u00ECnh g\u00E2y r\u1EAFc r\u1ED1i v\u00EC h\u1ECD \u0111ang th\u1EF1c hi\u1EC7n \u201CD\u1EF1 \u00E1n t\u1ED5ng th\u1EC3\u201D c\u1EE7a Hoa K\u1EF3. \\nM\u1EF9 khoe khoang d\u1EF1 \u00E1n n\u00E0y v\u1EDBi ch\u00FAng t\u00F4i n\u00EAn ch\u00FAng t\u00F4i bi\u1EBFt t\u00EAn d\u1EF1 \u00E1n. D\u1EF1 \u00E1n \u0111\u01B0\u1EE3c d\u1EABn d\u1EAFt b\u1EDFi \u0110\u1EA1i t\u00E1 Kh\u00F4ng qu\u00E2n Hoa K\u1EF3 \u0111\u00E3 ngh\u1EC9 h\u01B0u Raimundo Powell, l\u00E0m vi\u1EC7c v\u1EDBi c\u00E1c \u0111\u1ED1i t\u00E1c Philippines nh\u01B0 C\u1EA3nh s\u00E1t bi\u1EC3n Philippines v\u00E0 m\u1ED9t s\u1ED1 quan ch\u1EE9c c\u1EE7a B\u1ED9 Ngo\u1EA1i giao Philippines"
206
+ # qs = "Tóm tắt nội dung liên quan đến Biển Đông và Việt Nam"
207
+ # lg = "vi"
208
+ # ver = "en-llama"
209
+ # ans = summary_with_llama(qs, message, lg, version=ver, max_word_per_context=1024)
210
+ # print(ans)
function/tc_v2.py ADDED
@@ -0,0 +1,573 @@
1
+ from tensorRT import inference
2
+ import re
3
+ from collections import Counter
4
+ from vncorenlp import VnCoreNLP
5
+ from nltk.tokenize import sent_tokenize
6
+ import torch
7
+ import datetime
8
+ from sklearn.cluster import AgglomerativeClustering
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+ import numpy as np
11
+ import json
12
+ from . import utils
13
+ import time
14
+ from summary import text_summary, get_summary_bert
15
+ from function.clean_text import normalize_text
16
+ from .summary_with_llm import summary_with_llama
17
+ from .translate import translate_text_multi_layer
18
+ from scipy.spatial import distance
19
+ import copy
20
+ from .sentence_embbeding import embbeded_zh, embbeded_en, embedded_bge
21
+
22
+
23
+ # from . import detect_time as dt
24
+
25
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
26
+ use_cuda = torch.cuda.is_available()
27
+ print(torch.cuda.is_available())
28
+
29
+ # annotator = VnCoreNLP('vncorenlp/VnCoreNLP-1.1.1.jar', port=9191, annotators="wseg,pos", max_heap_size='-Xmx2g')
30
+
31
+
32
+ def detect_postaging(text_in):
33
+ word_segmented_text = annotator.annotate(text_in)
34
+ lst_k = []
35
+ for se in word_segmented_text["sentences"]:
36
+ for kw in se:
37
+ if kw["posTag"] in ("Np", "Ny", "N"):
38
+ if kw["posTag"] == "N" and "_" not in kw["form"]:
39
+ continue
40
+ lst_k.append(kw["form"].replace("_", " "))
41
+ return list(set(lst_k))
42
+
43
+ def clean_text(text_in):
44
+ doc = re.sub('<.*?>', '', text_in)
45
+ doc = re.sub('(function).*}', ' ', doc)
46
+ # link
47
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
48
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
49
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
50
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
51
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
52
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
53
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
54
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
55
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
56
+
57
+ doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
58
+ doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
59
+ doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
60
+ doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
61
+ doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
62
+ doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
63
+ doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
64
+ doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
65
+ doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
66
+ # escape sequence
67
+ doc = re.sub('\n', ' ', doc)
68
+ doc = re.sub('\t', ' ', doc)
69
+ doc = re.sub('\r', ' ', doc)
70
+
71
+ doc = normalize_text(doc)
72
+ return doc
73
+
74
+
75
+ def data_cleaning(docs):
76
+ res = []
77
+ for d in docs:
78
+ if 'message' in d:
79
+ # css and js
80
+ doc = re.sub('<.*?>', '', d['message'])
81
+ doc = re.sub('(function).*}', ' ', doc)
82
+
83
+ # link
84
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
85
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
86
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
87
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
88
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
89
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
90
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
91
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
92
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
93
+
94
+ doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
95
+ doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
96
+ doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
97
+ doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
98
+ doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
99
+ doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
100
+ doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
101
+ doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
102
+ doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
103
+ # escape sequence
104
+ doc = re.sub('\n', ' ', doc)
105
+ doc = re.sub('\t', ' ', doc)
106
+ doc = re.sub('\r', ' ', doc)
107
+
108
+ d['message'] = doc
109
+ res.append(d)
110
+ return res
111
+
112
+
113
+ def segment(docs, lang="vi"):
114
+ segmented_docs = []
115
+ for d in docs:
116
+ # print(d)
117
+ # if len(d.get('message', "")) > 8000 or len(d.get('message', "")) < 100:
118
+ if len(d.get('message', "")) > 8000:
119
+ continue
120
+ if 'snippet' not in d:
121
+ continue
122
+ try:
123
+ if lang == "vi":
124
+ snippet = d.get('snippet', "")
125
+ segmented_snippet = ""
126
+ segmented_sentences_snippet = annotator.tokenize(snippet)
127
+ for sentence in segmented_sentences_snippet:
128
+ segmented_snippet += ' ' + ' '.join(sentence)
129
+ segmented_snippet = segmented_snippet.replace('\xa0', '')
130
+ d['segmented_snippet'] = segmented_snippet
131
+ segmented_docs.append(d)
132
+ except Exception:
133
+ pass
134
+ return segmented_docs
135
+
136
+
137
+ def timestamp_to_date(timestamp):
138
+ return datetime.datetime.fromtimestamp(timestamp).strftime('%d/%m/%Y')
139
+
140
+
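+ # Score each cluster by the smallest cosine distance between its representative document vector
+ # and the prompt embeddings, and return the ascending sort order (most relevant clusters first).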
141
+ def re_ranking(result_topic, vectors_prompt, sorted_field):
142
+ lst_score = []
143
+ lst_ids = []
144
+ lst_top = []
145
+ try:
146
+ for k in result_topic:
147
+ lst_ids.append(k)
148
+ if not sorted_field.strip():
149
+ lst_top.append(len(result_topic[k]))
150
+ else:
151
+ lst_top.append(result_topic[k][0]['max_score'])
152
+ vector_center = result_topic[k][0]["vector"]
153
+ max_score = 11.0
154
+ for vec in vectors_prompt:
155
+ score = distance.cosine(np.array(vec), np.array(vector_center))
156
+ if score < max_score:
157
+ max_score = score
158
+ lst_score.append(max_score)
159
+ result_topic[k][0]["similarity_score"] = max_score
160
+ for d in result_topic[k]:
161
+ d["similarity_score"] = max_score
162
+ del result_topic[k][0]["vector"]
163
+ idx = np.argsort(np.array(lst_score))
164
+ except Exception as ve:
165
+ return [], lst_ids, lst_top
166
+ return idx, lst_ids, lst_top
167
+
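+ # Select the top clusters (prompt-ranked if a prompt is given, otherwise by size or max_score),
+ # prefer documents with full titles, and attach a summary and post count to each cluster's first document.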
168
+ def post_processing(response, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster = 50, delete_message=True, prompt="", hash_str: str= "", vectors_prompt: list = []):
169
+ print(f'[INFO] sorted_field: {sorted_field}')
170
+ MAX_DOC_PER_CLUSTER = max_doc_per_cluster
171
+
172
+ lst_ids = []
173
+ lst_top = []
174
+ lst_res = []
175
+ idx = []
176
+ if prompt:
177
+ idx, lst_ids, lst_top = re_ranking(response, vectors_prompt, sorted_field)
178
+ print("idx_prompt: ", idx)
179
+ if len(prompt) == 0 or len(idx) == 0:
180
+ for i in response:
181
+ lst_ids.append(i)
182
+ if not sorted_field.strip():
183
+ lst_top.append(len(response[i]))
184
+ else:
185
+ lst_top.append(response[i][0]['max_score'])
186
+ idx = np.argsort(np.array(lst_top))[::-1]
187
+ print("idx_not_prompt: ", idx)
188
+ if top_cluster == -1:
189
+ top_cluster = len(idx)
190
+ for i in idx[: top_cluster]:
191
+ ik = lst_ids[i]
192
+ if top_sentence == -1:
193
+ top_sentence = len(response[ik])
194
+ lst_check_title = []
195
+ lst_check_not_title = []
196
+ i_c_t = 0
197
+ for resss in response[ik]:
198
+ r_title = resss.get("title", "")
199
+ if r_title and not r_title.endswith("..."):
200
+ lst_check_title.append(resss)
201
+ i_c_t += 1
202
+ else:
203
+ lst_check_not_title.append(resss)
204
+ if i_c_t == top_sentence:
205
+ break
206
+ if i_c_t == top_sentence:
207
+ lst_res.append(lst_check_title)
208
+ else:
209
+ lst_check_title.extend(lst_check_not_title)
210
+ lst_res.append(lst_check_title[:top_sentence])
211
+ #lst_res.append(response[ik][:top_sentence])
212
+ dict_res = {}
213
+ for i in range(len(lst_res)):
214
+ dict_res[str(i + 1)] = lst_res[i][:MAX_DOC_PER_CLUSTER]
215
+ for j in range(min(len(dict_res[str(i + 1)]), 3)):
216
+ dict_res[str(i + 1)][0]["title_summarize"].append(dict_res[str(i + 1)][j].get("snippet", ""))
217
+ # t11 = time.time()
218
+ summary_text = get_summary_bert(dict_res[str(i + 1)][0].get("message", ""), dict_res[str(i + 1)][0].get("lang", "vi"), topn=topn_summary, title=dict_res[str(i + 1)][0].get("title", ""), snippet=dict_res[str(i + 1)][0].get("snippet", ""))
219
+ # print("time_summary: ", time.time() - t11)
220
+ if len(summary_text) < 10:
221
+ summary_text = dict_res[str(i + 1)][0].get("snippet", "")
222
+ if len(summary_text) < 10:
223
+ summary_text = dict_res[str(i + 1)][0].get("title", "")
224
+ summary_text = utils.remove_image_keyword(summary_text)
225
+ # if prompt:
226
+ # if dict_res[str(i + 1)][0].get("message", ""):
227
+ # src_lang = dict_res[str(i + 1)][0].get("lang", "")
228
+ # print("src_lang: ", src_lang)
229
+ # print("summary_text: ", summary_text)
230
+ # summary_text = translate_text_multi_layer(src_lang, "vi", summary_text)
231
+ # text_tran = translate_text_multi_layer(src_lang, "vi", dict_res[str(i + 1)][0].get("message", ""))
232
+ # ans_from_llama = summary_with_llama(prompt, text_tran, "vi", version="vi-llama", max_word_per_context=1000)
233
+ # print("ans_from_llama: ", ans_from_llama)
234
+ # summary_text = summary_text + "$$$$\n" + ans_from_llama
235
+ # print("summary_text: ", summary_text, len(summary_text))
236
+ dict_res[str(i + 1)][0]["content_summary"] = summary_text
237
+ dict_res[str(i + 1)][0]["num_of_post"] = len(lst_res[i])
238
+ kew_phares = []
239
+ dict_res[str(i + 1)][0]["topic_keywords"] = kew_phares
240
+
241
+ # print("delete_message: ", delete_message)
242
+ if delete_message:
243
+ for j in range(len(dict_res[str(i + 1)])):
244
+ if "message" in dict_res[str(i + 1)][j]:
245
+ del dict_res[str(i + 1)][j]["message"]
246
+
247
+ with open(f"log_llm/topic_result_after_postprocessing/{hash_str}.json", "w") as f:
248
+ dict_log_pos = {}
249
+ for k in dict_res:
250
+ dict_log_pos[k] = copy.deepcopy(dict_res[k])
251
+ for d in dict_log_pos[k]:
252
+ if "message" in d:
253
+ del d["message"]
254
+ if "vector" in d:
255
+ del d["vector"]
256
+ json.dump(dict_log_pos, f, ensure_ascii= False)
257
+ return dict_res
258
+
259
+
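+ # Return the dominant language code among the documents, collapsing zh_* variants to "zh".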
260
+ def get_lang(docs):
261
+ lang_vi = 0
262
+ lang_en = 0
263
+ dict_lang = {}
264
+ for d in docs:
265
+ lang = d.get("lang", "")
266
+ if lang not in dict_lang:
267
+ dict_lang[lang] = 0
268
+ dict_lang[lang] += 1
269
+ # if d.get("lang", "") == "vi":
270
+ # lang_vi += 1
271
+ # else:
272
+ # lang_en += 1
273
+ lst_lang = []
274
+ lst_cnt = []
275
+ for k in dict_lang:
276
+ lst_lang.append(k)
277
+ lst_cnt.append(dict_lang[k])
278
+ idx_max = np.argsort(np.array(lst_cnt))[::-1][0]
279
+ lang = lst_lang[int(idx_max)]
280
+
281
+ if lang.startswith("zh_"):
282
+ lang = "zh"
283
+ print("lang: ", lang, lst_cnt[int(idx_max)])
284
+ return lang
285
+
286
+
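+ # Embed title + snippet for each document and group them with agglomerative clustering on cosine
+ # distance; the threshold is tightened to 0.4 for complete-linkage runs on zh/en.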
287
+ def topic_clustering(docs, distance_threshold, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster=50,
288
+ delete_message=True, prompt="", type_cluster:str = "single", hash_str: str= "", id_topic=""):
289
+ # global model, model_en
290
+ with open("data/topic_name.txt") as f:
291
+ dict_topic_name = json.load(f)
292
+ topic_name_relevant = dict_topic_name.get(id_topic , "")
293
+ docs = docs[:30000]
294
+ lang = get_lang(docs)
295
+ if type_cluster == "complete" and lang == "zh":
296
+ distance_threshold = 0.4
297
+ if type_cluster == "complete" and lang == "en":
298
+ distance_threshold = 0.4
299
+ # type_cluster = "single"
300
+
301
+ result = {}
302
+ cluster_score = {}
303
+ cluster_real_vectors = {}
304
+ # docs = segment(docs, lang=lang)
305
+
306
+ t1 = time.time()
307
+ if len(docs) < 1:
308
+ return result
309
+ elif len(docs) == 1:
310
+ return {
311
+ "0": docs
312
+ }
313
+ vec_prompt = []
314
+ prompt_strips = []
315
+ # prompt = ""
316
+ if topic_name_relevant:
317
+ prompt_split = topic_name_relevant.split("#####")
318
+ for prom in prompt_split:
319
+ sys_p = prom.strip().split("$$$$")
320
+ if len(sys_p) == 1:
321
+ prompt_strips.append(prom.strip())
322
+ else:
323
+ prompt_strips.append(sys_p[1].strip())
324
+ if lang == "zh":
325
+ vec_prompt = embbeded_zh(prompt_split)
326
+ elif lang == "en":
327
+ vec_prompt = embbeded_en(prompt_split)
328
+ else:
329
+ vec_prompt = inference.encode(prompt_split, lang=lang)
330
+ if lang == "zh":
331
+ features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
332
+ vectors = embbeded_zh(features)
333
+ # vectors = embedded_bge(features)
334
+ if len(vectors) == 0:
335
+ print(f"[WARNING] Embedded {lang}: {len(vectors)} / {len(features)}")
336
+ vectors = inference.encode(features, lang=lang)
337
+ # vectors = model.encode(features, show_progress_bar=False)
338
+ elif lang == "en":
339
+ features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
340
+ vectors = embbeded_en(features)
341
+ # vectors = embedded_bge(features)
342
+ if len(vectors) == 0:
343
+ print(f"[WARNING] Embedded {lang}: {len(vectors)} / {len(features)}")
344
+ vectors = inference.encode(features, lang=lang)
345
+ else:
346
+ features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
347
+ # vectors = embedded_bge(features)
348
+ # if len(vectors) == 0:
349
+ # print(f"[WARNING] Embedded {lang}: {len(vectors)} / {len(features)}")
350
+ vectors = inference.encode(features, lang=lang)
351
+ # vectors = model_en.encode(features, show_progress_bar=False)
352
+ clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
353
+ linkage=type_cluster, distance_threshold=distance_threshold)
354
+ clusteror.fit(vectors)
355
+ matrix_vec = np.stack(vectors, axis=0)
356
+ print(f"Time encode + clustering: {time.time() - t1} {clusteror.n_clusters_}")
357
+ for i in range(clusteror.n_clusters_):
358
+ result[str(i + 1)] = []
359
+ cluster_score[str(i + 1)] = 0
360
+ ids = clusteror.labels_ # == i
361
+ # cluster_real_vectors[str(i + 1)] = re_clustering(ids, matrix_vec, distance_threshold, max_doc_per_cluster)
362
+
363
+ for i in range(len(clusteror.labels_)):
364
+ cluster_no = clusteror.labels_[i]
365
+ # if any((cluster_real_vectors[str(cluster_no+1)][:] == vectors[i]).all(1)):
366
+ if docs[i].get('domain','') not in ["cungcau.vn","baomoi.com","news.skydoor.net"]:
367
+ response_doc = {}
368
+ response_doc = docs[i]
369
+ score = response_doc.get('score', 0)
370
+ if not docs[i].get('message','').strip():
371
+ continue
372
+ if score > cluster_score[str(cluster_no + 1)]:
373
+ cluster_score[str(cluster_no + 1)] = score
374
+ if 'domain' in docs[i]:
375
+ response_doc['domain'] = docs[i]['domain']
376
+ if 'url' in docs[i]:
377
+ response_doc['url'] = docs[i]['url']
378
+ if 'title' in docs[i]:
379
+ response_doc['title'] = clean_text(docs[i]['title'])
380
+ if 'snippet' in docs[i]:
381
+ response_doc['snippet'] = clean_text(docs[i]['snippet'])
382
+ if 'created_time' in docs[i]:
383
+ response_doc['created_time'] = docs[i]['created_time']
384
+ if "sentiment" in docs[i]:
385
+ response_doc['sentiment'] = docs[i]['sentiment']
386
+ if 'message' in docs[i]:
387
+ title = docs[i].get('title','')
388
+ snippet = docs[i].get('snippet','')
389
+ message = docs[i].get('message','')
390
+ if title.strip():
391
+ split_mess = message.split(title)
392
+ if len(split_mess) > 1:
393
+ message = title.join(split_mess[1:])
394
+ if snippet.strip():
395
+ split_mess = message.split(snippet)
396
+ if len(split_mess) > 1:
397
+ message = snippet.join(split_mess[1:])
398
+
399
+ response_doc['message'] = clean_text(message)
400
+ if 'id' in docs[i]:
401
+ response_doc['id'] = docs[i]['id']
402
+ # response_doc['score'] = 0.0
403
+ response_doc['title_summarize'] = []
404
+ response_doc['content_summary'] = ""
405
+ response_doc['total_facebook_viral'] = 0
406
+ response_doc["vector"] = np.array(vectors[i]).tolist()
407
+ result[str(cluster_no + 1)].append(response_doc)
408
+ empty_clus_ids = []
409
+ for x in result:
410
+ result[x] = sorted(result[x], key=lambda i: -len(i.get('message','')))
411
+ if len( result[x]) > 0:
412
+ if len(result[x]) > 1:
413
+ result[x] = check_duplicate_title_domain(result[x])
414
+ result[x][0]['num_docs'] = len(result[x])
415
+ result[x][0]['max_score'] = cluster_score[x]
416
+ else:
417
+ empty_clus_ids.append(x)
418
+
419
+ for x in empty_clus_ids:
420
+ result.pop(x,None)
421
+ # result = dict(sorted(result.items(), key=lambda i: -len(i[1])))[:top_cluster]
422
+ with open(f"log_llm/topic_result_before_postprocessing/{hash_str}.json", "w") as f:
423
+ dict_log = {}
424
+ for k in result:
425
+ dict_log[k] = copy.deepcopy(result[k])
426
+ for d in dict_log[k]:
427
+ if "message" in d:
428
+ del d["message"]
429
+ if "vector" in d:
430
+ del d["vector"]
431
+ json.dump(dict_log, f, ensure_ascii= False)
432
+ return post_processing(result, top_cluster=top_cluster, top_sentence=top_sentence, topn_summary=topn_summary, sorted_field = sorted_field, max_doc_per_cluster=max_doc_per_cluster, delete_message=delete_message,
433
+ prompt=topic_name_relevant, hash_str=hash_str, vectors_prompt=vec_prompt)
434
+
435
+ def check_duplicate_title_domain(docs):
436
+ lst_title_domain = [f"{d.get('domain', '')} {d.get('title','')}" for d in docs]
437
+ for i in range(1,len(lst_title_domain) -1):
438
+ for j in range(i+1,len(lst_title_domain)):
439
+ if lst_title_domain[j] == lst_title_domain[i]:
440
+ lst_title_domain[j] = 'dup'
441
+ lst_filter_docs = [docs[i] for i,x in enumerate(lst_title_domain) if x != 'dup']
442
+ return lst_filter_docs
443
+ def convert_date(text):
444
+ text = text.replace(".", "/")
445
+ text = text.replace("-", "/")
446
+ return text
447
+
448
+
449
+ def check_keyword(sentence):
450
+ keyword = ['sáng', 'trưa', 'chiều', 'tối', 'đến', 'hôm', 'ngày', 'tới']
451
+ for k in keyword:
452
+ if k in sentence:
453
+ return True
454
+ return False
455
+
456
+
457
+ def extract_events_and_time(docs, publish_date):
458
+ def standardize(date_str):
459
+ return date_str.replace('.', '/').replace('-', '/')
460
+
461
+ def add_0(date_str):
462
+
463
+ date_str = date_str.split('/')
464
+ res = []
465
+ for o in date_str:
466
+ o = re.sub('\s+', '', o)
467
+ if len(o) < 2:
468
+ o = '0' + o
469
+ res.append(o)
470
+ date_str = '/'.join(res)
471
+ return date_str
472
+
473
+ def get_date_list(reg, sentence):
474
+ find_object = re.finditer(reg, sentence)
475
+ date_list = [x.group() for x in find_object]
476
+ return date_list
477
+
478
+ year = publish_date.split('/')[2]
479
+
480
+ # dd/mm/yyyy
481
+ reg_exp_1 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
482
+ # #mm/yyyy
483
+ # reg_exp_5 = '(\D|^)(?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
484
+ # dd/mm
485
+ reg_exp_2 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])(\D|$)'
486
+
487
+ # ngày dd tháng mm năm yyyy
488
+ reg_exp_3 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}\s*(năm)\s*\d{4}'
489
+ # ngày dd tháng mm
490
+ reg_exp_4 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}'
491
+
492
+ result = []
493
+ for d in docs:
494
+ text = d['message']
495
+ for sentence in sent_tokenize(text):
496
+ lower_sentence = sentence.lower()
497
+ c = re.search(reg_exp_3, sentence.lower())
498
+ d = re.search(reg_exp_4, sentence.lower())
499
+ # e = re.search(reg_exp_5, sentence.lower())
500
+ a = re.search(reg_exp_1, sentence)
501
+ b = re.search(reg_exp_2, sentence)
502
+ #
503
+ if (a or b or c or d) and check_keyword(lower_sentence):
504
+ date_list = get_date_list(reg_exp_1, lower_sentence)
505
+ date_entity = ''
506
+ if date_list:
507
+ date_entity = add_0(standardize(date_list[0]))
508
+ elif get_date_list(reg_exp_2, lower_sentence):
509
+ date_list = get_date_list(reg_exp_2, lower_sentence)
510
+ date_entity = add_0(standardize(date_list[0]) + '/' + year)
511
+ elif get_date_list(reg_exp_3, lower_sentence):
512
+ date_list = get_date_list(reg_exp_3, lower_sentence)
513
+
514
+ date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
515
+ date_entity = re.sub('\s+', ' ', date_entity)
516
+ date_entity = date_entity.replace(' ', '/')
517
+ date_entity = add_0(date_entity)
518
+ else:
519
+ date_list = get_date_list(reg_exp_4, lower_sentence)
520
+ if date_list != []:
521
+ date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
522
+ date_entity = re.sub('\s+', ' ', date_entity)
523
+ date_entity = date_entity.replace(' ', '/')
524
+ date_entity = date_entity + '/' + year
525
+ date_entity = add_0(date_entity)
526
+ result.append((sentence, date_entity))
527
+ return result
528
+
529
+ def find_index_nearest_vector(cluster, vectors):
530
+ # Compute the centroid of the cluster
531
+ centroid = np.mean(cluster, axis=0, keepdims=True)
532
+
533
+ # Calculate the Euclidean distance between each vector and the centroid
534
+ distances = cosine_similarity(centroid, vectors)
535
+
536
+ # Find the index of the vector with the minimum distance
537
+ nearest_index = np.argmin(distances, axis=1)
538
+
539
+
540
+ return nearest_index
541
+
542
+ def re_clustering(ids, vectors, distance_threshold, max_doc_per_cluster):
543
+ sub_vectors = vectors[ids]
544
+
545
+ try:
546
+ if sub_vectors.shape[0] < 2:
547
+ return sub_vectors
548
+ sub_clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
549
+ linkage='complete', distance_threshold=0.12)
550
+ sub_clusteror.fit(sub_vectors)
551
+ dict_cluster = {id_clus: sub_vectors[sub_clusteror.labels_ == id_clus] for id_clus in range(sub_clusteror.n_clusters_)}
552
+ dict_num_vec = {id_clus: v.shape[0] for id_clus, v in dict_cluster.items()}
553
+
554
+ max_num_cluster = max(dict_num_vec, key=dict_num_vec.get)
555
+ other_vectors = sub_vectors[sub_clusteror.labels_ != max_num_cluster]
556
+
557
+ # if other_vectors.shape[0]:
558
+ # while dict_num_vec[max_num_cluster] < max_doc_per_cluster:
559
+ # tmp_index_vec = find_index_nearest_vector(dict_cluster[max_num_cluster], other_vectors)
560
+ # dict_cluster[max_num_cluster] = np.vstack((dict_cluster[max_num_cluster], other_vectors[tmp_index_vec]))
561
+ # dict_num_vec[max_num_cluster] += 1
562
+ # if other_vectors.shape[0] != 1:
563
+ # other_vectors = np.delete(other_vectors, tmp_index_vec, axis=0)
564
+ # else:
565
+ # break
566
+ cosine_scores = cosine_similarity(dict_cluster[max_num_cluster], dict_cluster[max_num_cluster])
567
+ with open("/home/vietle/topic-clustering/log_score.txt", "a") as f:
568
+ f.write(str(cosine_scores) + "\n")
569
+ return dict_cluster[max_num_cluster]
570
+ except Exception as e:
571
+ with open("/home/vietle/topic-clustering/log_clustering_diemtin/log_cluster_second.txt", "a") as f:
572
+ f.write(str(e)+"$$"+json.dumps({"ids": ids.tolist(), "vectors": vectors.tolist()}))
573
+ return sub_vectors
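A minimal usage sketch for the clustering entry point defined above (hedged: the import path function.tc_v2, the 0.25 threshold, and the sample documents are illustrative assumptions; the function itself also expects data/topic_name.txt and the log_llm/ folders referenced in the code to exist):

from function.tc_v2 import topic_clustering  # assumed module path for this file

# Two toy documents with the fields the function reads (title, snippet, message, lang, domain, id).
docs = [
    {"id": "1", "title": "Tiêu đề A", "snippet": "Tóm tắt A", "message": "Nội dung bài A ...", "lang": "vi", "domain": "example.vn"},
    {"id": "2", "title": "Tiêu đề B", "snippet": "Tóm tắt B", "message": "Nội dung bài B ...", "lang": "vi", "domain": "example.vn"},
]
# distance_threshold=0.25 is an assumed value; with id_topic="" no prompt vectors are built.
clusters = topic_clustering(docs, distance_threshold=0.25, top_cluster=5, top_sentence=5,
                            type_cluster="single", hash_str="demo", id_topic="")
for cluster_id, posts in clusters.items():
    print(cluster_id, posts[0].get("content_summary", ""))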
function/topic_clustering.py ADDED
@@ -0,0 +1,458 @@
 
1
+ from email import message
2
+ import re
3
+ from vncorenlp import VnCoreNLP
4
+ from nltk.tokenize import sent_tokenize
5
+ import torch
6
+ from sentence_transformers import SentenceTransformer
7
+ import datetime
8
+ from sklearn.cluster import AgglomerativeClustering
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+ import numpy as np
11
+ import requests
12
+ import json
13
+ from . import utils
14
+ import time
15
+ from summary import text_summary, get_summary_bert
16
+ from function.clean_text import normalize_text
17
+ # from . import detect_time as dt
18
+
19
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
20
+ model = SentenceTransformer('model/distiluse-base-multilingual-cased-v2').to(device)
21
+ # model = SentenceTransformer('VoVanPhuc/sup-SimCSE-VietNamese-phobert-base').to(device)
22
+ # model.save('model/distiluse-base-multilingual-cased-v2')
23
+
24
+ use_cuda = torch.cuda.is_available()
25
+ print(torch.cuda.is_available())
26
+ if torch.cuda.is_available():
27
+ model_en = SentenceTransformer('model/paraphrase-mpnet-base-v2').to(device)
28
+ else:
29
+ model_en = model
30
+ # model_en.save('model/paraphrase-mpnet-base-v2')
31
+ annotator = VnCoreNLP('vncorenlp/VnCoreNLP-1.1.1.jar', port=9191, annotators="wseg,pos", max_heap_size='-Xmx8g')
32
+
33
+
34
+ def detect_postaging(text_in):
35
+ word_segmented_text = annotator.annotate(text_in)
36
+ lst_k = []
37
+ for se in word_segmented_text["sentences"]:
38
+ for kw in se:
39
+ if kw["posTag"] in ("Np", "Ny", "N"):
40
+ if kw["posTag"] == "N" and "_" not in kw["form"]:
41
+ continue
42
+ lst_k.append(kw["form"].replace("_", " "))
43
+ return list(set(lst_k))
44
+
45
+ def clean_text(text_in):
46
+ doc = re.sub('<.*?>', '', text_in)
47
+ doc = re.sub('(function).*}', ' ', doc)
48
+ # link
49
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
50
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
51
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
52
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
53
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
54
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
55
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
56
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
57
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
58
+
59
+ doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
60
+ doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
61
+ doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
62
+ doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
63
+ doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
64
+ doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
65
+ doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
66
+ doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
67
+ doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
68
+ # escape sequence
69
+ doc = re.sub('\n', ' ', doc)
70
+ doc = re.sub('\t', ' ', doc)
71
+ doc = re.sub('\r', ' ', doc)
72
+
73
+ doc = normalize_text(doc)
74
+ return doc
75
+
76
+
77
+ def data_cleaning(docs):
78
+ res = []
79
+ for d in docs:
80
+ if 'message' in d:
81
+ # css and js
82
+ doc = re.sub('<.*?>', '', d['message'])
83
+ doc = re.sub('(function).*}', ' ', doc)
84
+
85
+ # link
86
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
87
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
88
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
89
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
90
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
91
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
92
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
93
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
94
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
95
+
96
+ doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
97
+ doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
98
+ doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
99
+ doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
100
+ doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
101
+ doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
102
+ doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
103
+ doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
104
+ doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
105
+ # escape sequence
106
+ doc = re.sub('\n', ' ', doc)
107
+ doc = re.sub('\t', ' ', doc)
108
+ doc = re.sub('\r', ' ', doc)
109
+
110
+ d['message'] = doc
111
+ res.append(d)
112
+ return res
113
+
114
+
115
+ def segment(docs, lang="vi"):
116
+ segmented_docs = []
117
+ for d in docs:
118
+ # if len(d.get('message', "")) > 8000 or len(d.get('message', "")) < 100:
119
+ if len(d.get('message', "")) > 8000:
120
+ continue
121
+ if 'snippet' not in d:
122
+ continue
123
+ try:
124
+ if lang == "vi":
125
+ snippet = d.get('snippet', "")
126
+ segmented_snippet = ""
127
+ segmented_sentences_snippet = annotator.tokenize(snippet)
128
+ for sentence in segmented_sentences_snippet:
129
+ segmented_snippet += ' ' + ' '.join(sentence)
130
+ segmented_snippet = segmented_snippet.replace('\xa0', '')
131
+ d['segmented_snippet'] = segmented_snippet
132
+ segmented_docs.append(d)
133
+ except Exception:
134
+ pass
135
+ return segmented_docs
136
+
137
+
138
+ def timestamp_to_date(timestamp):
139
+ return datetime.datetime.fromtimestamp(timestamp).strftime('%d/%m/%Y')
140
+
141
+
142
+ def post_processing(response, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster = 50):
143
+ print(f'[INFO] sorted_field: {sorted_field}')
144
+ MAX_DOC_PER_CLUSTER = max_doc_per_cluster
145
+
146
+ lst_ids = []
147
+ lst_top = []
148
+ lst_res = []
149
+ for i in response:
150
+ lst_ids.append(i)
151
+
152
+ if not sorted_field.strip():
153
+ lst_top.append(len(response[i]))
154
+ else:
155
+ lst_top.append(response[i][0]['max_score'])
156
+
157
+ idx = np.argsort(np.array(lst_top))[::-1]
158
+ if top_cluster == -1:
159
+ top_cluster = len(idx)
160
+ for i in idx[: top_cluster]:
161
+ ik = lst_ids[i]
162
+ if top_sentence == -1:
163
+ top_sentence = len(response[ik])
164
+ lst_check_title = []
165
+ lst_check_not_title = []
166
+ i_c_t = 0
167
+ for resss in response[ik]:
168
+ r_title = resss.get("title", "")
169
+ if r_title and not r_title.endswith("..."):
170
+ lst_check_title.append(resss)
171
+ i_c_t += 1
172
+ else:
173
+ lst_check_not_title.append(resss)
174
+ if i_c_t == top_sentence:
175
+ break
176
+ if i_c_t == top_sentence:
177
+ lst_res.append(lst_check_title)
178
+ else:
179
+ lst_check_title.extend(lst_check_not_title)
180
+ lst_res.append(lst_check_title[:top_sentence])
181
+ #lst_res.append(response[ik][:top_sentence])
182
+ dict_res = {}
183
+ for i in range(len(lst_res)):
184
+ dict_res[str(i + 1)] = lst_res[i][:MAX_DOC_PER_CLUSTER]
185
+ for j in range(min(len(dict_res[str(i + 1)]), 3)):
186
+ dict_res[str(i + 1)][0]["title_summarize"].append(dict_res[str(i + 1)][j].get("snippet", ""))
187
+ summary_text = get_summary_bert(dict_res[str(i + 1)][0].get("message", ""), lang=dict_res[str(i + 1)][0].get("lang", "vi"), topn=topn_summary, title=dict_res[str(i + 1)][0].get("title", ""), snippet=dict_res[str(i + 1)][0].get("snippet", ""))
188
+ if len(summary_text) < 10:
189
+ summary_text = dict_res[str(i + 1)][0].get("snippet", "")
190
+ if len(summary_text) < 10:
191
+ summary_text = dict_res[str(i + 1)][0].get("title", "")
192
+ dict_res[str(i + 1)][0]["content_summary"] = utils.remove_image_keyword(summary_text)
193
+ kew_phares = []
194
+ dict_res[str(i + 1)][0]["topic_keywords"] = kew_phares
195
+ for j in range(len(dict_res[str(i + 1)])):
196
+ if "message" in dict_res[str(i + 1)][j]:
197
+ del dict_res[str(i + 1)][j]["message"]
198
+ return dict_res
199
+
200
+
201
+ def get_lang(docs):
202
+ lang_vi = 0
203
+ lang_en = 0
204
+ for d in docs:
205
+ if d.get("lang", "") == "vi":
206
+ lang_vi += 1
207
+ else:
208
+ lang_en += 1
209
+ if lang_vi >= lang_en:
210
+ return "vi"
211
+ return "en"
212
+
213
+
214
+ # def topic_clustering(docs, distance_threshold, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field=''):
215
+ # global model, model_en
216
+ # docs = docs[:30000]
217
+ # lang = get_lang(docs)
218
+ # result = {}
219
+ # docs = segment(docs, lang=lang)
220
+ # if len(docs) < 2:
221
+ # return result
222
+ # if lang == "vi":
223
+ # features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
224
+ # vectors = model.encode(features, show_progress_bar=False)
225
+ # else:
226
+ # features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
227
+ # vectors = model_en.encode(features, show_progress_bar=False)
228
+ # clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
229
+ # linkage='single', distance_threshold=distance_threshold)
230
+ # clusteror.fit(vectors)
231
+ # print(clusteror.n_clusters_)
232
+ # for i in range(clusteror.n_clusters_):
233
+ # result[str(i + 1)] = []
234
+ # for i in range(len(clusteror.labels_)):
235
+ # cluster_no = clusteror.labels_[i]
236
+ # if docs[i].get('domain','') not in ["cungcau.vn","baomoi.com","news.skydoor.net"]:
237
+ # response_doc = {}
238
+ # response_doc = docs[i]
239
+ # if 'domain' in docs[i]:
240
+ # response_doc['domain'] = docs[i]['domain']
241
+ # if 'url' in docs[i]:
242
+ # response_doc['url'] = docs[i]['url']
243
+ # if 'title' in docs[i]:
244
+ # response_doc['title'] = clean_text(docs[i]['title'])
245
+ # if 'snippet' in docs[i]:
246
+ # response_doc['snippet'] = clean_text(docs[i]['snippet'])
247
+ # if 'created_time' in docs[i]:
248
+ # response_doc['created_time'] = docs[i]['created_time']
249
+ # if 'message' in docs[i]:
250
+ # title = docs[i].get('title','')
251
+ # snippet = docs[i].get('snippet','')
252
+ # message = docs[i].get('message','')
253
+ # if title.strip():
254
+ # split_mess = message.split(title)
255
+ # if len(split_mess) > 1:
256
+ # message = title.join(split_mess[1:])
257
+ # if snippet.strip():
258
+ # split_mess = message.split(snippet)
259
+ # if len(split_mess) > 1:
260
+ # message = snippet.join(split_mess[1:])
261
+
262
+ # response_doc['message'] = clean_text(message)
263
+ # if 'id' in docs[i]:
264
+ # response_doc['id'] = docs[i]['id']
265
+ # response_doc['score'] = 0.0
266
+ # response_doc['title_summarize'] = []
267
+ # response_doc['content_summary'] = ""
268
+ # response_doc['total_facebook_viral'] = 0
269
+ # result[str(cluster_no + 1)].append(response_doc)
270
+
271
+ # empty_clus_ids = []
272
+ # for x in result:
273
+ # result[x] = sorted(result[x], key=lambda i: -len(i.get('message','')))
274
+ # if len( result[x]) > 0:
275
+ # if len(result[x]) > 1:
276
+ # result[x] = check_duplicate_title_domain(result[x])
277
+ # result[x][0]['num_docs'] = len(result[x])
278
+ # else:
279
+ # empty_clus_ids.append(x)
280
+
281
+ # for x in empty_clus_ids:
282
+ # result.pop(x,None)
283
+ # # result = dict(sorted(result.items(), key=lambda i: -len(i[1])))[:top_cluster]
284
+ # return post_processing(result, top_cluster=top_cluster, top_sentence=top_sentence, topn_summary=topn_summary, sorted_field = sorted_field)
285
+
286
+ def topic_clustering(docs, distance_threshold, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster=50, delete_message=True):
287
+ global model, model_en
288
+ docs = docs[:30000]
289
+ lang = get_lang(docs)
290
+ result = {}
291
+ cluster_score = {}
292
+ # docs = segment(docs, lang=lang)
293
+ if len(docs) < 2:
294
+ return result
295
+ if lang == "vi":
296
+ features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
297
+ vectors = model.encode(features, show_progress_bar=False)
298
+ else:
299
+ features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
300
+ vectors = model_en.encode(features, show_progress_bar=False)
301
+ clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
302
+ linkage='single', distance_threshold=distance_threshold)
303
+ clusteror.fit(vectors)
304
+ print(clusteror.n_clusters_)
305
+ for i in range(clusteror.n_clusters_):
306
+ result[str(i + 1)] = []
307
+ cluster_score[str(i + 1)] = 0
308
+ for i in range(len(clusteror.labels_)):
309
+ cluster_no = clusteror.labels_[i]
310
+ if docs[i].get('domain','') not in ["cungcau.vn","baomoi.com","news.skydoor.net"]:
311
+ response_doc = {}
312
+ response_doc = docs[i]
313
+ score = response_doc.get('score', 0)
314
+ if not docs[i].get('message','').strip():
315
+ continue
316
+ if score > cluster_score[str(cluster_no + 1)]:
317
+ cluster_score[str(cluster_no + 1)] = score
318
+ if 'domain' in docs[i]:
319
+ response_doc['domain'] = docs[i]['domain']
320
+ if 'url' in docs[i]:
321
+ response_doc['url'] = docs[i]['url']
322
+ if 'title' in docs[i]:
323
+ response_doc['title'] = clean_text(docs[i]['title'])
324
+ if 'snippet' in docs[i]:
325
+ response_doc['snippet'] = clean_text(docs[i]['snippet'])
326
+ if 'created_time' in docs[i]:
327
+ response_doc['created_time'] = docs[i]['created_time']
328
+ if 'message' in docs[i]:
329
+ title = docs[i].get('title','')
330
+ snippet = docs[i].get('snippet','')
331
+ message = docs[i].get('message','')
332
+ if title.strip():
333
+ split_mess = message.split(title)
334
+ if len(split_mess) > 1:
335
+ message = title.join(split_mess[1:])
336
+ if snippet.strip():
337
+ split_mess = message.split(snippet)
338
+ if len(split_mess) > 1:
339
+ message = snippet.join(split_mess[1:])
340
+
341
+ response_doc['message'] = clean_text(message)
342
+ if 'id' in docs[i]:
343
+ response_doc['id'] = docs[i]['id']
344
+ # response_doc['score'] = 0.0
345
+ response_doc['title_summarize'] = []
346
+ response_doc['content_summary'] = ""
347
+ response_doc['total_facebook_viral'] = 0
348
+ result[str(cluster_no + 1)].append(response_doc)
349
+
350
+ empty_clus_ids = []
351
+ for x in result:
352
+ result[x] = sorted(result[x], key=lambda i: -len(i.get('message','')))
353
+ if len( result[x]) > 0:
354
+ if len(result[x]) > 1:
355
+ result[x] = check_duplicate_title_domain(result[x])
356
+ result[x][0]['num_docs'] = len(result[x])
357
+ result[x][0]['max_score'] = cluster_score[x]
358
+ else:
359
+ empty_clus_ids.append(x)
360
+
361
+ for x in empty_clus_ids:
362
+ result.pop(x,None)
363
+ # result = dict(sorted(result.items(), key=lambda i: -len(i[1])))[:top_cluster]
364
+ return post_processing(result, top_cluster=top_cluster, top_sentence=top_sentence, topn_summary=topn_summary, sorted_field = sorted_field, max_doc_per_cluster=max_doc_per_cluster)
365
+
366
+ def check_duplicate_title_domain(docs):
367
+ lst_title_domain = [f"{d.get('domain', '')} {d.get('title','')}" for d in docs]
368
+ for i in range(1,len(lst_title_domain) -1):
369
+ for j in range(i+1,len(lst_title_domain)):
370
+ if lst_title_domain[j] == lst_title_domain[i]:
371
+ lst_title_domain[j] = 'dup'
372
+ lst_filter_docs = [docs[i] for i,x in enumerate(lst_title_domain) if x != 'dup']
373
+ return lst_filter_docs
374
+ def convert_date(text):
375
+ text = text.replace(".", "/")
376
+ text = text.replace("-", "/")
377
+ return text
378
+
379
+
380
+ def check_keyword(sentence):
381
+ keyword = ['sáng', 'trưa', 'chiều', 'tối', 'đến', 'hôm', 'ngày', 'tới']
382
+ for k in keyword:
383
+ if k in sentence:
384
+ return True
385
+ return False
386
+
387
+
388
+ def extract_events_and_time(docs, publish_date):
389
+ def standardize(date_str):
390
+ return date_str.replace('.', '/').replace('-', '/')
391
+
392
+ def add_0(date_str):
393
+
394
+ date_str = date_str.split('/')
395
+ res = []
396
+ for o in date_str:
397
+ o = re.sub('\s+', '', o)
398
+ if len(o) < 2:
399
+ o = '0' + o
400
+ res.append(o)
401
+ date_str = '/'.join(res)
402
+ return date_str
403
+
404
+ def get_date_list(reg, sentence):
405
+ find_object = re.finditer(reg, sentence)
406
+ date_list = [x.group() for x in find_object]
407
+ return date_list
408
+
409
+ year = publish_date.split('/')[2]
410
+
411
+ # dd/mm/yyyy
412
+ reg_exp_1 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
413
+ # #mm/yyyy
414
+ # reg_exp_5 = '(\D|^)(?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
415
+ # dd/mm
416
+ reg_exp_2 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])(\D|$)'
417
+
418
+ # ngày dd tháng mm năm yyyy
419
+ reg_exp_3 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}\s*(năm)\s*\d{4}'
420
+ # ngày dd tháng mm
421
+ reg_exp_4 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}'
422
+
423
+ result = []
424
+ for d in docs:
425
+ text = d['message']
426
+ for sentence in sent_tokenize(text):
427
+ lower_sentence = sentence.lower()
428
+ c = re.search(reg_exp_3, sentence.lower())
429
+ d = re.search(reg_exp_4, sentence.lower())
430
+ # e = re.search(reg_exp_5, sentence.lower())
431
+ a = re.search(reg_exp_1, sentence)
432
+ b = re.search(reg_exp_2, sentence)
433
+ #
434
+ if (a or b or c or d) and check_keyword(lower_sentence):
435
+ date_list = get_date_list(reg_exp_1, lower_sentence)
436
+ date_entity = ''
437
+ if date_list:
438
+ date_entity = add_0(standardize(date_list[0]))
439
+ elif get_date_list(reg_exp_2, lower_sentence):
440
+ date_list = get_date_list(reg_exp_2, lower_sentence)
441
+ date_entity = add_0(standardize(date_list[0]) + '/' + year)
442
+ elif get_date_list(reg_exp_3, lower_sentence):
443
+ date_list = get_date_list(reg_exp_3, lower_sentence)
444
+
445
+ date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
446
+ date_entity = re.sub('\s+', ' ', date_entity)
447
+ date_entity = date_entity.replace(' ', '/')
448
+ date_entity = add_0(date_entity)
449
+ else:
450
+ date_list = get_date_list(reg_exp_4, lower_sentence)
451
+ if date_list != []:
452
+ date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
453
+ date_entity = re.sub('\s+', ' ', date_entity)
454
+ date_entity = date_entity.replace(' ', '/')
455
+ date_entity = date_entity + '/' + year
456
+ date_entity = add_0(date_entity)
457
+ result.append((sentence, date_entity))
458
+ return result
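A small, hedged example of the date/event helper defined in this file (the message and publish date below are made-up values): extract_events_and_time scans each sentence for the date patterns above and returns (sentence, dd/mm/yyyy) pairs, borrowing the year from publish_date when the matched date has no year.

docs = [{"message": "Sự kiện sẽ diễn ra vào sáng ngày 15/8 tại Hà Nội."}]
events = extract_events_and_time(docs, publish_date="01/08/2023")
# expected: [("Sự kiện sẽ diễn ra vào sáng ngày 15/8 tại Hà Nội.", "15/08/2023")]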
function/topic_clustering_mnews.py ADDED
@@ -0,0 +1,339 @@
 
1
+ import re
2
+ from vncorenlp import VnCoreNLP
3
+ from nltk.tokenize import sent_tokenize
4
+ import torch
5
+ from sentence_transformers import SentenceTransformer
6
+ import datetime
7
+ from sklearn.cluster import AgglomerativeClustering
8
+
9
+ import numpy as np
10
+ import requests
11
+ import json
12
+ from . import utils
13
+ import time
14
+ from summary import text_summary, get_summary_bert
15
+ # from . import detect_time as dt
16
+
17
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
18
+ model = SentenceTransformer('VoVanPhuc/sup-SimCSE-VietNamese-phobert-base').to(device)
19
+ model_en = SentenceTransformer('paraphrase-mpnet-base-v2').to(device)
20
+ annotator = VnCoreNLP('vncorenlp/VnCoreNLP-1.1.1.jar', port=9191, annotators="wseg,pos", max_heap_size='-Xmx8g')
21
+
22
+
23
+ def detect_postaging(text_in):
24
+ word_segmented_text = annotator.annotate(text_in)
25
+ lst_k = []
26
+ for se in word_segmented_text["sentences"]:
27
+ for kw in se:
28
+ if kw["posTag"] in ("Np", "Ny", "N"):
29
+ if kw["posTag"] == "N" and "_" not in kw["form"]:
30
+ continue
31
+ lst_k.append(kw["form"].replace("_", " "))
32
+ return list(set(lst_k))
33
+
34
+ def clean_text(text_in):
35
+ doc = re.sub('<.*?>', '', text_in)
36
+ doc = re.sub('(function).*}', ' ', doc)
37
+ # link
38
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
39
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
40
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
41
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
42
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
43
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
44
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
45
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
46
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
47
+
48
+ doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
49
+ doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
50
+ doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
51
+ doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
52
+ doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
53
+ doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
54
+ doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
55
+ doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
56
+ doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
57
+ # escape sequence
58
+ doc = re.sub('\n', ' ', doc)
59
+ doc = re.sub('\t', ' ', doc)
60
+ doc = re.sub('\r', ' ', doc)
61
+ return doc
62
+
63
+
64
+ def data_cleaning(docs):
65
+ res = []
66
+ for d in docs:
67
+ if 'message' in d:
68
+ # css and js
69
+ doc = re.sub('<.*?>', '', d['message'])
70
+ doc = re.sub('(function).*}', ' ', doc)
71
+
72
+ # link
73
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
74
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
75
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
76
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
77
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
78
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
79
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
80
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
81
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
82
+
83
+ doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
84
+ doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
85
+ doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
86
+ doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
87
+ doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
88
+ doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
89
+ doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
90
+ doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
91
+ doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
92
+ # escape sequence
93
+ doc = re.sub('\n', ' ', doc)
94
+ doc = re.sub('\t', ' ', doc)
95
+ doc = re.sub('\r', ' ', doc)
96
+
97
+ d['message'] = doc
98
+ res.append(d)
99
+ return res
100
+
101
+
102
+ def segment(docs, lang="vi"):
103
+ segmented_docs = []
104
+ for d in docs:
105
+ # if len(d.get('message', "")) > 8000 or len(d.get('message', "")) < 100:
106
+ # continue
107
+ if 'snippet' not in d and 'title' not in d:
108
+ continue
109
+ try:
110
+ if lang == "vi":
111
+ snippet = d.get('snippet', "")
112
+ segmented_snippet = ""
113
+ segmented_sentences_snippet = annotator.tokenize(snippet)
114
+ for sentence in segmented_sentences_snippet:
115
+ segmented_snippet += ' ' + ' '.join(sentence)
116
+ segmented_snippet = segmented_snippet.replace('\xa0', '')
117
+ d['segmented_snippet'] = segmented_snippet
118
+ segmented_docs.append(d)
119
+ except Exception:
120
+ pass
121
+ return segmented_docs
122
+
123
+
124
+ def timestamp_to_date(timestamp):
125
+ return datetime.datetime.fromtimestamp(timestamp).strftime('%d/%m/%Y')
126
+
127
+
128
+ def sort_content(lst_res):
129
+ lst_content = []
130
+ lst_cnt = []
131
+ for i in range(len(lst_res)):
132
+ lst_cnt.append(len(lst_res[i].get("message", "")))
133
+ id_sort = np.argsort(np.array(lst_cnt))[::-1]
134
+ for i in id_sort:
135
+ lst_content.append(lst_res[i])
136
+ return lst_content
137
+
138
+
139
+
140
+ def post_processing(response, top_cluster=5, top_sentence=5, topn_summary=5):
141
+ lst_ids = []
142
+ lst_top = []
143
+ lst_res = []
144
+ for i in response:
145
+ lst_ids.append(i)
146
+ lst_top.append(len(response[i]))
147
+ idx = np.argsort(np.array(lst_top))[::-1]
148
+ if top_cluster == -1:
149
+ top_cluster = len(idx)
150
+ for i in idx[: top_cluster]:
151
+ ik = lst_ids[i]
152
+ if top_sentence == -1:
153
+ top_sentence = len(response[ik])
154
+ lst_check_title = []
155
+ lst_check_not_title = []
156
+ i_c_t = 0
157
+ response_sort = sort_content(response[ik].copy())
158
+ for resss in response_sort:
159
+ if resss.get("title", ""):
160
+ lst_check_title.append(resss)
161
+ i_c_t += 1
162
+ else:
163
+ lst_check_not_title.append(resss)
164
+ if i_c_t == top_sentence:
165
+ break
166
+ if i_c_t == top_sentence:
167
+ lst_res.append(lst_check_title)
168
+ else:
169
+ lst_check_title.extend(lst_check_not_title)
170
+ lst_res.append(lst_check_title[:top_sentence])
171
+ dict_res = {}
172
+ for i in range(len(lst_res)):
173
+ dict_res[str(i + 1)] = lst_res[i]
174
+ for j in range(min(len(dict_res[str(i + 1)]), 3)):
175
+ dict_res[str(i + 1)][0]["title_summarize"].append(dict_res[str(i + 1)][j].get("snippet", ""))
176
+ summary_text = get_summary_bert(dict_res[str(i + 1)][0].get("message", ""), lang = dict_res[str(i + 1)][0].get("lang", "vi"), topn=topn_summary)
177
+ if len(summary_text) < 10:
178
+ summary_text = dict_res[str(i + 1)][0].get("snippet", "")
179
+ if len(summary_text) < 10:
180
+ summary_text = dict_res[str(i + 1)][0].get("title", "")
181
+ dict_res[str(i + 1)][0]["content_summary"] = utils.remove_image_keyword(summary_text)
182
+ kew_phares = []
183
+ dict_res[str(i + 1)][0]["topic_keywords"] = kew_phares
184
+ for j in range(len(dict_res[str(i + 1)])):
185
+ if "message" in dict_res[str(i + 1)][j]:
186
+ del dict_res[str(i + 1)][j]["message"]
187
+ return dict_res
188
+
189
+
190
+ def get_lang(docs):
191
+ lang_vi = 0
192
+ lang_en = 0
193
+ docs_lang_vi = []
194
+ docs_lang_en = []
195
+ for d in docs:
196
+ if d.get("lang", "") == "en":
197
+ lang_en += 1
198
+ docs_lang_en.append(d)
199
+ else:
200
+ lang_vi += 1
201
+ docs_lang_vi.append(d)
202
+ if lang_vi > lang_en:
203
+ return "vi", docs_lang_vi
204
+ return "en", docs_lang_en
205
+
206
+
207
+ def topic_clustering(docs, distance_threshold, top_cluster=5, top_sentence=5, topn_summary=5, benchmark_id=1):
208
+ global model, model_en
209
+ lang, docs = get_lang(docs)
210
+ result = {}
211
+ docs = segment(docs, lang=lang)
212
+ print("docs segment: ", len(docs))
213
+ if len(docs) < 2:
214
+ return result
215
+ if lang == "vi":
216
+ features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
217
+ vectors = model.encode(features, show_progress_bar=False)
218
+ else:
219
+ features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
220
+ vectors = model_en.encode(features, show_progress_bar=False)
221
+ clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
222
+ linkage='single', distance_threshold=distance_threshold)
223
+ clusteror.fit(vectors)
224
+ print(clusteror.n_clusters_)
225
+ for i in range(clusteror.n_clusters_):
226
+ result[str(i + 1)] = []
227
+ for i in range(len(clusteror.labels_)):
228
+ cluster_no = clusteror.labels_[i]
229
+ response_doc = {}
230
+ if 'url' in docs[i]:
231
+ response_doc['url'] = docs[i]['url']
232
+ if 'domain' in docs[i]:
233
+ response_doc['domain'] = docs[i]['domain']
234
+ if 'title' in docs[i]:
235
+ response_doc['title'] = clean_text(docs[i]['title'])
236
+ if 'snippet' in docs[i]:
237
+ response_doc['snippet'] = clean_text(docs[i]['snippet'])
238
+ if 'created_time' in docs[i]:
239
+ response_doc['created_time'] = docs[i]['created_time']
240
+ if 'message' in docs[i]:
241
+ response_doc['message'] = clean_text(docs[i]['message'])
242
+ if 'id' in docs[i]:
243
+ response_doc['id'] = docs[i]['id']
244
+ response_doc['score'] = 0.0
245
+ response_doc['title_summarize'] = []
246
+ response_doc['content_summary'] = ""
247
+ response_doc['total_facebook_viral'] = 0
248
+ result[str(cluster_no + 1)].append(response_doc)
249
+ # print("before filter: ", len(result))
250
+ # result = smart_filter(result, benchmark_id=benchmark_id)
251
+ # print("after filter: ", len(result))
252
+ return post_processing(result, top_cluster=top_cluster, top_sentence=top_sentence, topn_summary=topn_summary)
253
+
254
+
255
+ def convert_date(text):
256
+ text = text.replace(".", "/")
257
+ text = text.replace("-", "/")
258
+ return text
259
+
260
+
261
+ def check_keyword(sentence):
262
+ keyword = ['sáng', 'trưa', 'chiều', 'tối', 'đến', 'hôm', 'ngày', 'tới']
263
+ for k in keyword:
264
+ if k in sentence:
265
+ return True
266
+ return False
267
+
268
+
269
+ def extract_events_and_time(docs, publish_date):
270
+ def standardize(date_str):
271
+ return date_str.replace('.', '/').replace('-', '/')
272
+
273
+ def add_0(date_str):
274
+
275
+ date_str = date_str.split('/')
276
+ res = []
277
+ for o in date_str:
278
+ o = re.sub('\s+', '', o)
279
+ if len(o) < 2:
280
+ o = '0' + o
281
+ res.append(o)
282
+ date_str = '/'.join(res)
283
+ return date_str
284
+
285
+ def get_date_list(reg, sentence):
286
+ find_object = re.finditer(reg, sentence)
287
+ date_list = [x.group() for x in find_object]
288
+ return date_list
289
+
290
+ year = publish_date.split('/')[2]
291
+
292
+ # dd/mm/yyyy
293
+ reg_exp_1 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
294
+ # #mm/yyyy
295
+ # reg_exp_5 = '(\D|^)(?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
296
+ # dd/mm
297
+ reg_exp_2 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])(\D|$)'
298
+
299
+ # ngày dd tháng mm năm yyyy
300
+ reg_exp_3 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}\s*(năm)\s*\d{4}'
301
+ # ngày dd tháng mm
302
+ reg_exp_4 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}'
303
+
304
+ result = []
305
+ for d in docs:
306
+ text = d['message']
307
+ for sentence in sent_tokenize(text):
308
+ lower_sentence = sentence.lower()
309
+ c = re.search(reg_exp_3, sentence.lower())
310
+ d = re.search(reg_exp_4, sentence.lower())
311
+ # e = re.search(reg_exp_5, sentence.lower())
312
+ a = re.search(reg_exp_1, sentence)
313
+ b = re.search(reg_exp_2, sentence)
314
+ #
315
+ if (a or b or c or d) and check_keyword(lower_sentence):
316
+ date_list = get_date_list(reg_exp_1, lower_sentence)
317
+ date_entity = ''
318
+ if date_list:
319
+ date_entity = add_0(standardize(date_list[0]))
320
+ elif get_date_list(reg_exp_2, lower_sentence):
321
+ date_list = get_date_list(reg_exp_2, lower_sentence)
322
+ date_entity = add_0(standardize(date_list[0]) + '/' + year)
323
+ elif get_date_list(reg_exp_3, lower_sentence):
324
+ date_list = get_date_list(reg_exp_3, lower_sentence)
325
+
326
+ date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
327
+ date_entity = re.sub('\s+', ' ', date_entity)
328
+ date_entity = date_entity.replace(' ', '/')
329
+ date_entity = add_0(date_entity)
330
+ else:
331
+ date_list = get_date_list(reg_exp_4, lower_sentence)
332
+ if date_list != []:
333
+ date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
334
+ date_entity = re.sub('\s+', ' ', date_entity)
335
+ date_entity = date_entity.replace(' ', '/')
336
+ date_entity = date_entity + '/' + year
337
+ date_entity = add_0(date_entity)
338
+ result.append((sentence, date_entity))
339
+ return result
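A brief, hedged note on one difference in this variant: get_lang here returns both the majority language and the documents filtered to that language, so callers unpack a tuple (the sample posts below are placeholders).

lang, docs_kept = get_lang([{"lang": "vi"}, {"lang": "vi"}, {"lang": "en"}])
# lang == "vi"; docs_kept contains only the two Vietnamese posts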
function/topic_clustering_not_summary.py ADDED
@@ -0,0 +1,463 @@
 
1
+ from email import message
2
+ import re
3
+ from turtle import title
4
+ from vncorenlp import VnCoreNLP
5
+ from nltk.tokenize import sent_tokenize
6
+ import torch
7
+ from sentence_transformers import SentenceTransformer
8
+ import datetime
9
+ from sklearn.cluster import AgglomerativeClustering
10
+ from sklearn.metrics.pairwise import cosine_similarity
11
+ import numpy as np
12
+ import requests
13
+ import json
14
+ from . import utils
15
+ import time
16
+ from summary import text_summary, get_summary_bert
17
+ from function.clean_text import normalize_text
18
+ # from . import detect_time as dt
19
+
20
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
21
+ # model = SentenceTransformer('model/distiluse-base-multilingual-cased-v2').to(device)
22
+ model = SentenceTransformer('model/paraphrase-multilingual-MiniLM-L12-v2')
23
+
24
+ # model = SentenceTransformer('VoVanPhuc/sup-SimCSE-VietNamese-phobert-base').to(device)
25
+ # model.save('model/distiluse-base-multilingual-cased-v2')
26
+
27
+ use_cuda = torch.cuda.is_available()
28
+ print(torch.cuda.is_available())
29
+ if torch.cuda.is_available():
30
+ model_en = SentenceTransformer('model/paraphrase-mpnet-base-v2').to(device)
31
+ else:
32
+ model_en = model
33
+ # model_en.save('model/paraphrase-mpnet-base-v2')
34
+ annotator = VnCoreNLP('vncorenlp/VnCoreNLP-1.1.1.jar', port=9191, annotators="wseg,pos", max_heap_size='-Xmx8g')
35
+
36
+
37
+ def detect_postaging(text_in):
38
+ word_segmented_text = annotator.annotate(text_in)
39
+ lst_k = []
40
+ for se in word_segmented_text["sentences"]:
41
+ for kw in se:
42
+ if kw["posTag"] in ("Np", "Ny", "N"):
43
+ if kw["posTag"] == "N" and "_" not in kw["form"]:
44
+ continue
45
+ lst_k.append(kw["form"].replace("_", " "))
46
+ return list(set(lst_k))
47
+
48
+ def clean_text(text_in):
49
+ doc = re.sub('<.*?>', '', text_in)
50
+ doc = re.sub('(function).*}', ' ', doc)
51
+ # link
52
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
53
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
54
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
55
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
56
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
57
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
58
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
59
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
60
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
61
+
62
+ doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
63
+ doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
64
+ doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
65
+ doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
66
+ doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
67
+ doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
68
+ doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
69
+ doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
70
+ doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
71
+ # escape sequence
72
+ doc = re.sub('\n', ' ', doc)
73
+ doc = re.sub('\t', ' ', doc)
74
+ doc = re.sub('\r', ' ', doc)
75
+
76
+ doc = normalize_text(doc)
77
+ return doc
78
+
79
+
80
+ def data_cleaning(docs):
81
+ res = []
82
+ for d in docs:
83
+ if 'message' in d:
84
+ # css and js
85
+ doc = re.sub('<.*?>', '', d['message'])
86
+ doc = re.sub('(function).*}', ' ', doc)
87
+
88
+ # link
89
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
90
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
91
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
92
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
93
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
94
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
95
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
96
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
97
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
98
+
99
+ doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
100
+ doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
101
+ doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
102
+ doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
103
+ doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
104
+ doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
105
+ doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
106
+ doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
107
+ doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
108
+ # escape sequence
109
+ doc = re.sub('\n', ' ', doc)
110
+ doc = re.sub('\t', ' ', doc)
111
+ doc = re.sub('\r', ' ', doc)
112
+
113
+ d['message'] = doc
114
+ res.append(d)
115
+ return res
116
+
117
+
118
+ def segment(docs, lang="vi"):
119
+ segmented_docs = []
120
+ for d in docs:
121
+ # if len(d.get('message', "")) > 8000 or len(d.get('message', "")) < 100:
122
+ if len(d.get('message', "")) > 8000:
123
+ continue
124
+ if 'snippet' not in d:
125
+ continue
126
+ try:
127
+ if lang == "vi":
128
+ snippet = d.get('snippet', "")
129
+ segmented_snippet = ""
130
+ segmented_sentences_snippet = annotator.tokenize(snippet)
131
+ for sentence in segmented_sentences_snippet:
132
+ segmented_snippet += ' ' + ' '.join(sentence)
133
+ segmented_snippet = segmented_snippet.replace('\xa0', '')
134
+ d['segmented_snippet'] = segmented_snippet
135
+ segmented_docs.append(d)
136
+ except Exception:
137
+ pass
138
+ return segmented_docs
139
+
140
+
141
+ def timestamp_to_date(timestamp):
142
+ return datetime.datetime.fromtimestamp(timestamp).strftime('%d/%m/%Y')
143
+
144
+
145
+ def post_processing(response, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster = 50, delete_message=True):
146
+ print(f'[INFO] sorted_field: {sorted_field}')
147
+ MAX_DOC_PER_CLUSTER = max_doc_per_cluster
148
+
149
+ lst_ids = []
150
+ lst_top = []
151
+ lst_res = []
152
+ for i in response:
153
+ lst_ids.append(i)
154
+
155
+ if not sorted_field.strip():
156
+ lst_top.append(len(response[i]))
157
+ else:
158
+ lst_top.append(response[i][0]['max_score'])
159
+
160
+ idx = np.argsort(np.array(lst_top))[::-1]
161
+ if top_cluster == -1:
162
+ top_cluster = len(idx)
163
+ for i in idx[: top_cluster]:
164
+ ik = lst_ids[i]
165
+ if top_sentence == -1:
166
+ top_sentence = len(response[ik])
167
+ lst_check_title = []
168
+ lst_check_not_title = []
169
+ i_c_t = 0
170
+ for resss in response[ik]:
171
+ r_title = resss.get("title", "")
172
+ if r_title and not r_title.endswith("..."):
173
+ lst_check_title.append(resss)
174
+ i_c_t += 1
175
+ else:
176
+ lst_check_not_title.append(resss)
177
+ if i_c_t == top_sentence:
178
+ break
179
+ if i_c_t == top_sentence:
180
+ lst_res.append(lst_check_title)
181
+ else:
182
+ lst_check_title.extend(lst_check_not_title)
183
+ lst_res.append(lst_check_title[:top_sentence])
184
+ #lst_res.append(response[ik][:top_sentence])
185
+ dict_res = {}
186
+ for i in range(len(lst_res)):
187
+ dict_res[str(i + 1)] = lst_res[i][:MAX_DOC_PER_CLUSTER]
188
+ for j in range(min(len(dict_res[str(i + 1)]), 3)):
189
+ dict_res[str(i + 1)][0]["title_summarize"].append(dict_res[str(i + 1)][j].get("snippet", ""))
190
+ summary_text = get_summary_bert(dict_res[str(i + 1)][0].get("message", ""), lang=dict_res[str(i + 1)][0].get("lang", "vi"), topn=topn_summary, title=dict_res[str(i + 1)][0].get("title", ""), snippet=dict_res[str(i + 1)][0].get("snippet", ""))
191
+ if len(summary_text) < 10:
192
+ summary_text = dict_res[str(i + 1)][0].get("snippet", "")
193
+ if len(summary_text) < 10:
194
+ summary_text = dict_res[str(i + 1)][0].get("title", "")
195
+ dict_res[str(i + 1)][0]["content_summary"] = utils.remove_image_keyword(summary_text)
196
+ kew_phares = []
197
+ dict_res[str(i + 1)][0]["topic_keywords"] = kew_phares
198
+
199
+ if delete_message:
200
+ for j in range(len(dict_res[str(i + 1)])):
201
+ if "message" in dict_res[str(i + 1)][j]:
202
+ del dict_res[str(i + 1)][j]["message"]
203
+ return dict_res
204
+
205
+
206
+ def get_lang(docs):
207
+ lang_vi = 0
208
+ lang_en = 0
209
+ for d in docs:
210
+ if d.get("lang", "") == "vi":
211
+ lang_vi += 1
212
+ else:
213
+ lang_en += 1
214
+ if lang_vi >= lang_en:
215
+ return "vi"
216
+ return "en"
217
+
218
+
219
+ # def topic_clustering(docs, distance_threshold, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field=''):
220
+ # global model, model_en
221
+ # docs = docs[:30000]
222
+ # lang = get_lang(docs)
223
+ # result = {}
224
+ # docs = segment(docs, lang=lang)
225
+ # if len(docs) < 2:
226
+ # return result
227
+ # if lang == "vi":
228
+ # features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
229
+ # vectors = model.encode(features, show_progress_bar=False)
230
+ # else:
231
+ # features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
232
+ # vectors = model_en.encode(features, show_progress_bar=False)
233
+ # clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
234
+ # linkage='single', distance_threshold=distance_threshold)
235
+ # clusteror.fit(vectors)
236
+ # print(clusteror.n_clusters_)
237
+ # for i in range(clusteror.n_clusters_):
238
+ # result[str(i + 1)] = []
239
+ # for i in range(len(clusteror.labels_)):
240
+ # cluster_no = clusteror.labels_[i]
241
+ # if docs[i].get('domain','') not in ["cungcau.vn","baomoi.com","news.skydoor.net"]:
242
+ # response_doc = {}
243
+ # response_doc = docs[i]
244
+ # if 'domain' in docs[i]:
245
+ # response_doc['domain'] = docs[i]['domain']
246
+ # if 'url' in docs[i]:
247
+ # response_doc['url'] = docs[i]['url']
248
+ # if 'title' in docs[i]:
249
+ # response_doc['title'] = clean_text(docs[i]['title'])
250
+ # if 'snippet' in docs[i]:
251
+ # response_doc['snippet'] = clean_text(docs[i]['snippet'])
252
+ # if 'created_time' in docs[i]:
253
+ # response_doc['created_time'] = docs[i]['created_time']
254
+ # if 'message' in docs[i]:
255
+ # title = docs[i].get('title','')
256
+ # snippet = docs[i].get('snippet','')
257
+ # message = docs[i].get('message','')
258
+ # if title.strip():
259
+ # split_mess = message.split(title)
260
+ # if len(split_mess) > 1:
261
+ # message = title.join(split_mess[1:])
262
+ # if snippet.strip():
263
+ # split_mess = message.split(snippet)
264
+ # if len(split_mess) > 1:
265
+ # message = snippet.join(split_mess[1:])
266
+
267
+ # response_doc['message'] = clean_text(message)
268
+ # if 'id' in docs[i]:
269
+ # response_doc['id'] = docs[i]['id']
270
+ # response_doc['score'] = 0.0
271
+ # response_doc['title_summarize'] = []
272
+ # response_doc['content_summary'] = ""
273
+ # response_doc['total_facebook_viral'] = 0
274
+ # result[str(cluster_no + 1)].append(response_doc)
275
+
276
+ # empty_clus_ids = []
277
+ # for x in result:
278
+ # result[x] = sorted(result[x], key=lambda i: -len(i.get('message','')))
279
+ # if len( result[x]) > 0:
280
+ # if len(result[x]) > 1:
281
+ # result[x] = check_duplicate_title_domain(result[x])
282
+ # result[x][0]['num_docs'] = len(result[x])
283
+ # else:
284
+ # empty_clus_ids.append(x)
285
+
286
+ # for x in empty_clus_ids:
287
+ # result.pop(x,None)
288
+ # # result = dict(sorted(result.items(), key=lambda i: -len(i[1])))[:top_cluster]
289
+ # return post_processing(result, top_cluster=top_cluster, top_sentence=top_sentence, topn_summary=topn_summary, sorted_field = sorted_field)
290
+
291
+ def topic_clustering(docs, distance_threshold, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster=50, delete_message=True):
292
+ global model, model_en
293
+ docs = docs[:30000]
294
+ lang = get_lang(docs)
295
+ result = {}
296
+ cluster_score = {}
297
+ docs = segment(docs, lang=lang)
298
+ if len(docs) < 2:
299
+ return result
300
+ if lang == "vi":
301
+ features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
302
+ vectors = model.encode(features, show_progress_bar=False)
303
+ else:
304
+ features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
305
+ vectors = model_en.encode(features, show_progress_bar=False)
306
+ clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
307
+ linkage='single', distance_threshold=distance_threshold)
308
+ clusteror.fit(vectors)
309
+ print(clusteror.n_clusters_)
310
+ for i in range(clusteror.n_clusters_):
311
+ result[str(i + 1)] = []
312
+ cluster_score[str(i + 1)] = 0
313
+ for i in range(len(clusteror.labels_)):
314
+ cluster_no = clusteror.labels_[i]
315
+ if docs[i].get('domain','') not in ["cungcau.vn","baomoi.com","news.skydoor.net"]:
316
+ response_doc = {}
317
+ response_doc = docs[i]
318
+ score = response_doc.get('score', 0)
319
+ if not docs[i].get('message','').strip():
320
+ continue
321
+ if score > cluster_score[str(cluster_no + 1)]:
322
+ cluster_score[str(cluster_no + 1)] = score
323
+ if 'domain' in docs[i]:
324
+ response_doc['domain'] = docs[i]['domain']
325
+ if 'url' in docs[i]:
326
+ response_doc['url'] = docs[i]['url']
327
+ if 'title' in docs[i]:
328
+ response_doc['title'] = clean_text(docs[i]['title'])
329
+ if 'snippet' in docs[i]:
330
+ response_doc['snippet'] = clean_text(docs[i]['snippet'])
331
+ if 'created_time' in docs[i]:
332
+ response_doc['created_time'] = docs[i]['created_time']
333
+ if 'message' in docs[i]:
334
+ title = docs[i].get('title','')
335
+ snippet = docs[i].get('snippet','')
336
+ message = docs[i].get('message','')
337
+ if title.strip():
338
+ split_mess = message.split(title)
339
+ if len(split_mess) > 1:
340
+ message = title.join(split_mess[1:])
341
+ if snippet.strip():
342
+ split_mess = message.split(snippet)
343
+ if len(split_mess) > 1:
344
+ message = snippet.join(split_mess[1:])
345
+
346
+ response_doc['message'] = clean_text(message)
347
+ if 'id' in docs[i]:
348
+ response_doc['id'] = docs[i]['id']
349
+ # response_doc['score'] = 0.0
350
+ response_doc['title_summarize'] = []
351
+ response_doc['content_summary'] = ""
352
+ response_doc['total_facebook_viral'] = 0
353
+ result[str(cluster_no + 1)].append(response_doc)
354
+
355
+ empty_clus_ids = []
356
+ for x in result:
357
+ result[x] = sorted(result[x], key=lambda i: -len(i.get('message','')))
358
+ if len( result[x]) > 0:
359
+ if len(result[x]) > 1:
360
+ result[x] = check_duplicate_title_domain(result[x])
361
+ result[x][0]['num_docs'] = len(result[x])
362
+ result[x][0]['max_score'] = cluster_score[x]
363
+ else:
364
+ empty_clus_ids.append(x)
365
+
366
+ for x in empty_clus_ids:
367
+ result.pop(x,None)
368
+ # result = dict(sorted(result.items(), key=lambda i: -len(i[1])))[:top_cluster]
369
+ return post_processing(result, top_cluster=top_cluster, top_sentence=top_sentence, topn_summary=topn_summary, sorted_field = sorted_field, max_doc_per_cluster=max_doc_per_cluster, delete_message=delete_message)
370
+
371
+ def check_duplicate_title_domain(docs):
372
+ lst_title_domain = [f"{d.get('domain', '')} {d.get('title','')}" for d in docs]
373
+ for i in range(len(lst_title_domain) - 1):
374
+ for j in range(i+1,len(lst_title_domain)):
375
+ if lst_title_domain[j] == lst_title_domain[i]:
376
+ lst_title_domain[j] = 'dup'
377
+ lst_filter_docs = [docs[i] for i,x in enumerate(lst_title_domain) if x != 'dup']
378
+ return lst_filter_docs
379
+ def convert_date(text):
380
+ text = text.replace(".", "/")
381
+ text = text.replace("-", "/")
382
+ return text
383
+
384
+
385
+ def check_keyword(sentence):
386
+ keyword = ['sáng', 'trưa', 'chiều', 'tối', 'đến', 'hôm', 'ngày', 'tới']
387
+ for k in keyword:
388
+ if k in sentence:
389
+ return True
390
+ return False
391
+
392
+
393
+ def extract_events_and_time(docs, publish_date):
394
+ def standardize(date_str):
395
+ return date_str.replace('.', '/').replace('-', '/')
396
+
397
+ def add_0(date_str):
398
+
399
+ date_str = date_str.split('/')
400
+ res = []
401
+ for o in date_str:
402
+ o = re.sub('\s+', '', o)
403
+ if len(o) < 2:
404
+ o = '0' + o
405
+ res.append(o)
406
+ date_str = '/'.join(res)
407
+ return date_str
408
+
409
+ def get_date_list(reg, sentence):
410
+ find_object = re.finditer(reg, sentence)
411
+ date_list = [x.group() for x in find_object]
412
+ return date_list
413
+
414
+ year = publish_date.split('/')[2]
415
+
416
+ # dd/mm/yyyy
417
+ reg_exp_1 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
418
+ # #mm/yyyy
419
+ # reg_exp_5 = '(\D|^)(?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
420
+ # dd/mm
421
+ reg_exp_2 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])(\D|$)'
422
+
423
+ # ngày dd tháng mm năm yyyy
424
+ reg_exp_3 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}\s*(năm)\s*\d{4}'
425
+ # ngày dd tháng mm
426
+ reg_exp_4 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}'
427
+
428
+ result = []
429
+ for d in docs:
430
+ text = d['message']
431
+ for sentence in sent_tokenize(text):
432
+ lower_sentence = sentence.lower()
433
+ c = re.search(reg_exp_3, sentence.lower())
434
+ d = re.search(reg_exp_4, sentence.lower())
435
+ # e = re.search(reg_exp_5, sentence.lower())
436
+ a = re.search(reg_exp_1, sentence)
437
+ b = re.search(reg_exp_2, sentence)
438
+ #
439
+ if (a or b or c or d) and check_keyword(lower_sentence):
440
+ date_list = get_date_list(reg_exp_1, lower_sentence)
441
+ date_entity = ''
442
+ if date_list:
443
+ date_entity = add_0(standardize(date_list[0]))
444
+ elif get_date_list(reg_exp_2, lower_sentence):
445
+ date_list = get_date_list(reg_exp_2, lower_sentence)
446
+ date_entity = add_0(standardize(date_list[0]) + '/' + year)
447
+ elif get_date_list(reg_exp_3, lower_sentence):
448
+ date_list = get_date_list(reg_exp_3, lower_sentence)
449
+
450
+ date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
451
+ date_entity = re.sub('\s+', ' ', date_entity)
452
+ date_entity = date_entity.replace(' ', '/')
453
+ date_entity = add_0(date_entity)
454
+ else:
455
+ date_list = get_date_list(reg_exp_4, lower_sentence)
456
+ if date_list != []:
457
+ date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
458
+ date_entity = re.sub('\s+', ' ', date_entity)
459
+ date_entity = date_entity.replace(' ', '/')
460
+ date_entity = date_entity + '/' + year
461
+ date_entity = add_0(date_entity)
462
+ result.append((sentence, date_entity))
463
+ return result
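A minimal usage sketch for the clustering entry point above, assuming the module's model setup (the model / model_en globals and the word segmenter used by segment) loads successfully in your environment; the field names and distance_threshold=0.3 are illustrative only:

from function.topic_clustering_not_summary import topic_clustering

docs = [
    {"id": "1", "lang": "vi", "domain": "example.vn",
     "title": "Giá xăng tăng", "snippet": "Giá xăng trong nước tăng từ 15h hôm nay.",
     "message": "Giá xăng trong nước tăng từ 15h hôm nay theo quyết định điều hành giá."},
    {"id": "2", "lang": "vi", "domain": "example2.vn",
     "title": "Xăng dầu đồng loạt tăng giá", "snippet": "Liên bộ điều chỉnh giá bán lẻ xăng dầu.",
     "message": "Liên bộ điều chỉnh giá bán lẻ xăng dầu, mỗi lít xăng tăng gần 500 đồng."},
]

# Returns a dict keyed by cluster id; each value is the ranked list of documents
# in that cluster, with summary fields filled in by post_processing().
clusters = topic_clustering(docs, distance_threshold=0.3, top_cluster=5,
                            top_sentence=5, topn_summary=5)
print(list(clusters.keys()))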
function/topic_clustering_social.py ADDED
@@ -0,0 +1,156 @@
1
+ import json
2
+ import time
3
+ from .utils import get_sbert_embedding, clean_text
4
+ from sklearn.cluster import AgglomerativeClustering
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ from nltk import sent_tokenize
7
+ import requests
8
+ # from clean_text import normalize_text
9
+
10
+ MAX_LENGTH_FEATURE = 250
11
+ MIN_LENGTH_FEATURE = 100
12
+ URL_CHECK_SPAM = "http://10.9.3.70:30036/predict"
13
+
14
+ def check_spam(docs):
15
+ json_body = {
16
+ "domain_id": "",
17
+ "records": [
18
+ {
19
+ "text": doc.get("message",""),
20
+ "idxcol": 1
21
+ } for doc in docs
22
+ ]
23
+ }
24
+
25
+ result = requests.post(URL_CHECK_SPAM, json = json_body).json()
26
+ docs = [x for i,x in enumerate(docs) if result[i]["label"] == 0]
27
+ return docs
28
+
29
+ def preocess_feature(doc):
30
+ message = doc.get("message","")
31
+ paras = message.split("\n")
32
+ feature = ""
33
+ paras = [clean_text(x.strip(), normalize=False) for x in paras if x.strip() and len(x.strip()) > 10]
34
+ for para in paras:
35
+ if len(feature) + len(para) < MAX_LENGTH_FEATURE:
36
+ feature += " " +para
37
+ elif len(feature) < MIN_LENGTH_FEATURE:
38
+ sens = sent_tokenize(para)
39
+ for sen in sens:
40
+ if len(feature) + len(sen) < MAX_LENGTH_FEATURE or len(feature.strip()) < MIN_LENGTH_FEATURE:
41
+ feature += " " +sen
42
+ return feature
43
+
44
+ def topic_clustering(docs, distance_threshold, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster=50, delete_message=True, is_check_spam = True):
45
+ # global model, model_en
46
+
47
+ docs = [x for x in docs if len(x.get("message","")) > 100]
48
+ docs = docs[:30000]
49
+ if is_check_spam:
50
+ docs = check_spam(docs)
51
+ result = {}
52
+ cluster_score = {}
53
+
54
+ t1 = time.time()
55
+ if len(docs) < 1:
56
+ return result
57
+ elif len(docs) == 1:
58
+ return {
59
+ "0": docs
60
+ }
61
+
62
+ # features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
63
+
64
+ f_docs = []
65
+ for x in docs:
66
+ ft = preocess_feature(x)
67
+ if len(ft) > MIN_LENGTH_FEATURE:
68
+ x["title"] = ft
69
+ f_docs.append(x)
70
+ docs = f_docs
71
+
72
+ features = [x["title"] for x in docs ]
73
+ # with open("feature", 'w') as f:
74
+ # json.dump(features, f, ensure_ascii = False)
75
+ # print(features)
76
+ vectors = get_sbert_embedding(features)
77
+
78
+ clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
79
+ linkage='complete', distance_threshold=distance_threshold)
80
+ clusteror.fit(vectors)
81
+ print(f"Time encode + clustering: {time.time() - t1} {clusteror.n_clusters_}")
82
+ for i in range(clusteror.n_clusters_):
83
+ result[str(i + 1)] = []
84
+ cluster_score[str(i + 1)] = 0
85
+ for i in range(len(clusteror.labels_)):
86
+ cluster_no = clusteror.labels_[i]
87
+ if docs[i].get('domain','') not in ["cungcau.vn","baomoi.com","news.skydoor.net"]:
88
+ response_doc = {}
89
+ response_doc = docs[i]
90
+ score = response_doc.get('score', 0)
91
+ if not docs[i].get('message','').strip():
92
+ continue
93
+ if score > cluster_score[str(cluster_no + 1)]:
94
+ cluster_score[str(cluster_no + 1)] = score
95
+ if 'domain' in docs[i]:
96
+ response_doc['domain'] = docs[i]['domain']
97
+ if 'url' in docs[i]:
98
+ response_doc['url'] = docs[i]['url']
99
+ if 'title' in docs[i]:
100
+ response_doc['title'] = clean_text(docs[i]['title'])
101
+ if 'snippet' in docs[i]:
102
+ response_doc['snippet'] = clean_text(docs[i]['snippet'])
103
+ if 'created_time' in docs[i]:
104
+ response_doc['created_time'] = docs[i]['created_time']
105
+ if "sentiment" in docs[i]:
106
+ response_doc['sentiment'] = docs[i]['sentiment']
107
+ if 'message' in docs[i]:
108
+ title = docs[i].get('title','')
109
+ snippet = docs[i].get('snippet','')
110
+ message = docs[i].get('message','')
111
+ # if title.strip():
112
+ # split_mess = message.split(title)
113
+ # if len(split_mess) > 1:
114
+ # message = title.join(split_mess[1:])
115
+ # if snippet.strip():
116
+ # split_mess = message.split(snippet)
117
+ # if len(split_mess) > 1:
118
+ # message = snippet.join(split_mess[1:])
119
+
120
+ response_doc['message'] = clean_text(message)
121
+ if 'id' in docs[i]:
122
+ response_doc['id'] = docs[i]['id']
123
+ # response_doc['score'] = 0.0
124
+
125
+ # response_doc['title_summarize'] = []
126
+ # response_doc['content_summary'] = ""
127
+ # response_doc['total_facebook_viral'] = 0
128
+ result[str(cluster_no + 1)].append(response_doc)
129
+
130
+ empty_clus_ids = []
131
+ for x in result:
132
+ result[x] = sorted(result[x], key=lambda i: -len(i.get('message','')))
133
+ if len( result[x]) > 0:
134
+ # if len(result[x]) > 1:
135
+ # result[x] = check_duplicate_title_domain(result[x])
136
+ result[x][0]['num_docs'] = len(result[x])
137
+ result[x][0]['max_score'] = cluster_score[x]
138
+ else:
139
+ empty_clus_ids.append(x)
140
+
141
+ for x in empty_clus_ids:
142
+ result.pop(x,None)
143
+
144
+ result = dict( sorted(result.items(), key=lambda i: -len(i[1]))[:top_cluster])
145
+ return result
146
+ # return post_processing(result, top_cluster=top_cluster, top_sentence=top_sentence, topn_summary=topn_summary, sorted_field = sorted_field, max_doc_per_cluster=max_doc_per_cluster, delete_message=delete_message)
147
+
148
+ if __name__ == '__main__':
149
+ # with open("/home2/vietle/DA-Report/social.json", 'r') as f:
150
+ # docs = json.load(f)[:2000]
151
+ with open("/home2/vietle/news-cms/topic_summarization/data/news_cms.social.json", 'r') as f:
152
+ docs = json.load(f)[:10000]
153
+ clusters = topic_clustering(docs, distance_threshold=0.2, top_cluster=5000, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster=50, delete_message=False)
154
+ with open("/home2/vietle/news-cms/topic_summarization/cluster/news_cms.social.json", 'w') as f:
155
+
156
+ json.dump(clusters,f, ensure_ascii =False)
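check_spam() above posts each document's message to the spam classifier at URL_CHECK_SPAM and keeps only records answered with label == 0. For local testing, a stand-in service can be sketched as below; the response contract is inferred from the client code only, and Flask is used purely for illustration:

from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route("/predict", methods=["POST"])
def predict():
    records = request.get_json(force=True).get("records", [])
    # Answer "not spam" (label 0) for every record so all documents pass the filter.
    return jsonify([{"idxcol": r.get("idxcol", 1), "label": 0} for r in records])

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=30036)

Point URL_CHECK_SPAM at wherever this mock runs before calling topic_clustering with is_check_spam=True.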
function/topic_clustering_v2.py ADDED
@@ -0,0 +1,390 @@
1
+ from tensorRT import inference
2
+ import re
3
+ from vncorenlp import VnCoreNLP
4
+ from nltk.tokenize import sent_tokenize
5
+ import torch
6
+ import datetime
7
+ from sklearn.cluster import AgglomerativeClustering
8
+ from sklearn.metrics.pairwise import cosine_similarity
9
+ import numpy as np
10
+ import json
11
+ from . import utils
12
+ import time
13
+ from summary import text_summary, get_summary_bert
14
+ from function.clean_text import normalize_text
15
+
16
+ # from . import detect_time as dt
17
+
18
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
19
+ use_cuda = torch.cuda.is_available()
20
+ print(torch.cuda.is_available())
21
+
22
+ # annotator = VnCoreNLP('vncorenlp/VnCoreNLP-1.1.1.jar', port=9191, annotators="wseg,pos", max_heap_size='-Xmx2g')
23
+
24
+
25
+ def detect_postaging(text_in):
26
+ word_segmented_text = annotator.annotate(text_in)
27
+ lst_k = []
28
+ for se in word_segmented_text["sentences"]:
29
+ for kw in se:
30
+ if kw["posTag"] in ("Np", "Ny", "N"):
31
+ if kw["posTag"] == "N" and "_" not in kw["form"]:
32
+ continue
33
+ lst_k.append(kw["form"].replace("_", " "))
34
+ return list(set(lst_k))
35
+
36
+ def clean_text(text_in):
37
+ doc = re.sub('<.*?>', '', text_in)
38
+ doc = re.sub('(function).*}', ' ', doc)
39
+ # link
40
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
41
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
42
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
43
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
44
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
45
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
46
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
47
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
48
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
49
+
50
+ doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
51
+ doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
52
+ doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
53
+ doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
54
+ doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
55
+ doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
56
+ doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
57
+ doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
58
+ doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
59
+ # escape sequence
60
+ doc = re.sub('\n', ' ', doc)
61
+ doc = re.sub('\t', ' ', doc)
62
+ doc = re.sub('\r', ' ', doc)
63
+
64
+ doc = normalize_text(doc)
65
+ return doc
66
+
67
+
68
+ def data_cleaning(docs):
69
+ res = []
70
+ for d in docs:
71
+ if 'message' in d:
72
+ # css and js
73
+ doc = re.sub('<.*?>', '', d['message'])
74
+ doc = re.sub('(function).*}', ' ', doc)
75
+
76
+ # link
77
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
78
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
79
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
80
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
81
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
82
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
83
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
84
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
85
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
86
+
87
+ doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
88
+ doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
89
+ doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
90
+ doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
91
+ doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
92
+ doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
93
+ doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
94
+ doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
95
+ doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
96
+ # escape sequence
97
+ doc = re.sub('\n', ' ', doc)
98
+ doc = re.sub('\t', ' ', doc)
99
+ doc = re.sub('\r', ' ', doc)
100
+
101
+ d['message'] = doc
102
+ res.append(d)
103
+ return res
104
+
105
+
106
+ def segment(docs, lang="vi"):
107
+ segmented_docs = []
108
+ for d in docs:
109
+ print(d)
110
+ # if len(d.get('message', "")) > 8000 or len(d.get('message', "")) < 100:
111
+ if len(d.get('message', "")) > 8000:
112
+ continue
113
+ if 'snippet' not in d:
114
+ continue
115
+ try:
116
+ if lang == "vi":
117
+ snippet = d.get('snippet', "")
118
+ segmented_snippet = ""
119
+ segmented_sentences_snippet = annotator.tokenize(snippet)
120
+ for sentence in segmented_sentences_snippet:
121
+ segmented_snippet += ' ' + ' '.join(sentence)
122
+ segmented_snippet = segmented_snippet.replace('\xa0', '')
123
+ d['segmented_snippet'] = segmented_snippet
124
+ segmented_docs.append(d)
125
+ except Exception:
126
+ pass
127
+ return segmented_docs
128
+
129
+
130
+ def timestamp_to_date(timestamp):
131
+ return datetime.datetime.fromtimestamp(timestamp).strftime('%d/%m/%Y')
132
+
133
+
134
+ def post_processing(response, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster = 50, delete_message=True):
135
+ print(f'[INFO] sorted_field: {sorted_field}')
136
+ MAX_DOC_PER_CLUSTER = max_doc_per_cluster
137
+
138
+ lst_ids = []
139
+ lst_top = []
140
+ lst_res = []
141
+ for i in response:
142
+ lst_ids.append(i)
143
+
144
+ if not sorted_field.strip():
145
+ lst_top.append(len(response[i]))
146
+ else:
147
+ lst_top.append(response[i][0]['max_score'])
148
+
149
+ idx = np.argsort(np.array(lst_top))[::-1]
150
+ if top_cluster == -1:
151
+ top_cluster = len(idx)
152
+ for i in idx[: top_cluster]:
153
+ ik = lst_ids[i]
154
+ if top_sentence == -1:
155
+ top_sentence = len(response[ik])
156
+ lst_check_title = []
157
+ lst_check_not_title = []
158
+ i_c_t = 0
159
+ for resss in response[ik]:
160
+ r_title = resss.get("title", "")
161
+ if r_title and not r_title.endswith("..."):
162
+ lst_check_title.append(resss)
163
+ i_c_t += 1
164
+ else:
165
+ lst_check_not_title.append(resss)
166
+ if i_c_t == top_sentence:
167
+ break
168
+ if i_c_t == top_sentence:
169
+ lst_res.append(lst_check_title)
170
+ else:
171
+ lst_check_title.extend(lst_check_not_title)
172
+ lst_res.append(lst_check_title[:top_sentence])
173
+ #lst_res.append(response[ik][:top_sentence])
174
+ dict_res = {}
175
+ for i in range(len(lst_res)):
176
+ dict_res[str(i + 1)] = lst_res[i][:MAX_DOC_PER_CLUSTER]
177
+ for j in range(min(len(dict_res[str(i + 1)]), 3)):
178
+ dict_res[str(i + 1)][0]["title_summarize"].append(dict_res[str(i + 1)][j].get("snippet", ""))
179
+ summary_text = get_summary_bert(dict_res[str(i + 1)][0].get("message", ""), dict_res[str(i + 1)][0].get("lang", "vi"), topn=topn_summary, title=dict_res[str(i + 1)][0].get("title", ""), snippet=dict_res[str(i + 1)][0].get("snippet", ""))
180
+ if len(summary_text) < 10:
181
+ summary_text = dict_res[str(i + 1)][0].get("snippet", "")
182
+ if len(summary_text) < 10:
183
+ summary_text = dict_res[str(i + 1)][0].get("title", "")
184
+ dict_res[str(i + 1)][0]["content_summary"] = utils.remove_image_keyword(summary_text)
185
+ kew_phares = []
186
+ dict_res[str(i + 1)][0]["topic_keywords"] = kew_phares
187
+
188
+ print("delete_message: ", delete_message)
189
+ if delete_message:
190
+ for j in range(len(dict_res[str(i + 1)])):
191
+ if "message" in dict_res[str(i + 1)][j]:
192
+ del dict_res[str(i + 1)][j]["message"]
193
+ return dict_res
194
+
195
+
196
+ def get_lang(docs):
197
+ lang_vi = 0
198
+ lang_en = 0
199
+ for d in docs:
200
+ if d.get("lang", "") == "vi":
201
+ lang_vi += 1
202
+ else:
203
+ lang_en += 1
204
+ if lang_en >= lang_vi:
205
+ return "en"
206
+ return "vi"
207
+
208
+ def topic_clustering(docs, distance_threshold, top_cluster=5, top_sentence=5, topn_summary=5, sorted_field='', max_doc_per_cluster=50, delete_message=True):
209
+ # global model, model_en
210
+ docs = docs[:30000]
211
+ lang = get_lang(docs)
212
+ result = {}
213
+ cluster_score = {}
214
+ # docs = segment(docs, lang=lang)
215
+
216
+ t1 = time.time()
217
+ if len(docs) < 1:
218
+ return result
219
+ elif len(docs) == 1:
220
+ return {
221
+ "0": docs
222
+ }
223
+ if lang == "vi":
224
+ features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
225
+ vectors = inference.encode(features, lang=lang)
226
+ # vectors = model.encode(features, show_progress_bar=False)
227
+ else:
228
+ features = [doc.get('title', "") + ". " + doc.get('snippet', "") for doc in docs]
229
+ vectors = inference.encode(features, lang=lang)
230
+ # vectors = model_en.encode(features, show_progress_bar=False)
231
+ clusteror = AgglomerativeClustering(n_clusters=None, compute_full_tree=True, affinity='cosine',
232
+ linkage='single', distance_threshold=distance_threshold)
233
+ clusteror.fit(vectors)
234
+ print(f"Time encode + clustering: {time.time() - t1} {clusteror.n_clusters_}")
235
+ for i in range(clusteror.n_clusters_):
236
+ result[str(i + 1)] = []
237
+ cluster_score[str(i + 1)] = 0
238
+ for i in range(len(clusteror.labels_)):
239
+ cluster_no = clusteror.labels_[i]
240
+ if docs[i].get('domain','') not in ["cungcau.vn","baomoi.com","news.skydoor.net"]:
241
+ response_doc = {}
242
+ response_doc = docs[i]
243
+ score = response_doc.get('score', 0)
244
+ if not docs[i].get('message','').strip():
245
+ continue
246
+ if score > cluster_score[str(cluster_no + 1)]:
247
+ cluster_score[str(cluster_no + 1)] = score
248
+ if 'domain' in docs[i]:
249
+ response_doc['domain'] = docs[i]['domain']
250
+ if 'url' in docs[i]:
251
+ response_doc['url'] = docs[i]['url']
252
+ if 'title' in docs[i]:
253
+ response_doc['title'] = clean_text(docs[i]['title'])
254
+ if 'snippet' in docs[i]:
255
+ response_doc['snippet'] = clean_text(docs[i]['snippet'])
256
+ if 'created_time' in docs[i]:
257
+ response_doc['created_time'] = docs[i]['created_time']
258
+ if "sentiment" in docs[i]:
259
+ response_doc['sentiment'] = docs[i]['sentiment']
260
+ if 'message' in docs[i]:
261
+ title = docs[i].get('title','')
262
+ snippet = docs[i].get('snippet','')
263
+ message = docs[i].get('message','')
264
+ if title.strip():
265
+ split_mess = message.split(title)
266
+ if len(split_mess) > 1:
267
+ message = title.join(split_mess[1:])
268
+ if snippet.strip():
269
+ split_mess = message.split(snippet)
270
+ if len(split_mess) > 1:
271
+ message = snippet.join(split_mess[1:])
272
+
273
+ response_doc['message'] = clean_text(message)
274
+ if 'id' in docs[i]:
275
+ response_doc['id'] = docs[i]['id']
276
+ # response_doc['score'] = 0.0
277
+ response_doc['title_summarize'] = []
278
+ response_doc['content_summary'] = ""
279
+ response_doc['total_facebook_viral'] = 0
280
+ result[str(cluster_no + 1)].append(response_doc)
281
+
282
+ empty_clus_ids = []
283
+ for x in result:
284
+ result[x] = sorted(result[x], key=lambda i: -len(i.get('message','')))
285
+ if len( result[x]) > 0:
286
+ if len(result[x]) > 1:
287
+ result[x] = check_duplicate_title_domain(result[x])
288
+ result[x][0]['num_docs'] = len(result[x])
289
+ result[x][0]['max_score'] = cluster_score[x]
290
+ else:
291
+ empty_clus_ids.append(x)
292
+
293
+ for x in empty_clus_ids:
294
+ result.pop(x,None)
295
+ # result = dict(sorted(result.items(), key=lambda i: -len(i[1])))[:top_cluster]
296
+ return post_processing(result, top_cluster=top_cluster, top_sentence=top_sentence, topn_summary=topn_summary, sorted_field = sorted_field, max_doc_per_cluster=max_doc_per_cluster, delete_message=delete_message)
297
+
298
+ def check_duplicate_title_domain(docs):
299
+ lst_title_domain = [f"{d.get('domain', '')} {d.get('title','')}" for d in docs]
300
+ for i in range(len(lst_title_domain) - 1):
301
+ for j in range(i+1,len(lst_title_domain)):
302
+ if lst_title_domain[j] == lst_title_domain[i]:
303
+ lst_title_domain[j] = 'dup'
304
+ lst_filter_docs = [docs[i] for i,x in enumerate(lst_title_domain) if x != 'dup']
305
+ return lst_filter_docs
306
+ def convert_date(text):
307
+ text = text.replace(".", "/")
308
+ text = text.replace("-", "/")
309
+ return text
310
+
311
+
312
+ def check_keyword(sentence):
313
+ keyword = ['sáng', 'trưa', 'chiều', 'tối', 'đến', 'hôm', 'ngày', 'tới']
314
+ for k in keyword:
315
+ if k in sentence:
316
+ return True
317
+ return False
318
+
319
+
320
+ def extract_events_and_time(docs, publish_date):
321
+ def standardize(date_str):
322
+ return date_str.replace('.', '/').replace('-', '/')
323
+
324
+ def add_0(date_str):
325
+
326
+ date_str = date_str.split('/')
327
+ res = []
328
+ for o in date_str:
329
+ o = re.sub('\s+', '', o)
330
+ if len(o) < 2:
331
+ o = '0' + o
332
+ res.append(o)
333
+ date_str = '/'.join(res)
334
+ return date_str
335
+
336
+ def get_date_list(reg, sentence):
337
+ find_object = re.finditer(reg, sentence)
338
+ date_list = [x.group() for x in find_object]
339
+ return date_list
340
+
341
+ year = publish_date.split('/')[2]
342
+
343
+ # dd/mm/yyyy
344
+ reg_exp_1 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
345
+ # #mm/yyyy
346
+ # reg_exp_5 = '(\D|^)(?:0?[1-9]|1[012])[- \/.]([12]([0-9]){3})(\D|$)'
347
+ # dd/mm
348
+ reg_exp_2 = '(\D|^)(?:0?[1-9]|[12][0-9]|3[01])[- \/.](?:0?[1-9]|1[012])(\D|$)'
349
+
350
+ # ngày dd tháng mm năm yyyy
351
+ reg_exp_3 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}\s*(năm)\s*\d{4}'
352
+ # ngày dd tháng mm
353
+ reg_exp_4 = '(ngày)\s*\d{1,2}\s*(tháng)\s*\d{1,2}'
354
+
355
+ result = []
356
+ for d in docs:
357
+ text = d['message']
358
+ for sentence in sent_tokenize(text):
359
+ lower_sentence = sentence.lower()
360
+ c = re.search(reg_exp_3, sentence.lower())
361
+ d = re.search(reg_exp_4, sentence.lower())
362
+ # e = re.search(reg_exp_5, sentence.lower())
363
+ a = re.search(reg_exp_1, sentence)
364
+ b = re.search(reg_exp_2, sentence)
365
+ #
366
+ if (a or b or c or d) and check_keyword(lower_sentence):
367
+ date_list = get_date_list(reg_exp_1, lower_sentence)
368
+ date_entity = ''
369
+ if date_list:
370
+ date_entity = add_0(standardize(date_list[0]))
371
+ elif get_date_list(reg_exp_2, lower_sentence):
372
+ date_list = get_date_list(reg_exp_2, lower_sentence)
373
+ date_entity = add_0(standardize(date_list[0]) + '/' + year)
374
+ elif get_date_list(reg_exp_3, lower_sentence):
375
+ date_list = get_date_list(reg_exp_3, lower_sentence)
376
+
377
+ date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
378
+ date_entity = re.sub('\s+', ' ', date_entity)
379
+ date_entity = date_entity.replace(' ', '/')
380
+ date_entity = add_0(date_entity)
381
+ else:
382
+ date_list = get_date_list(reg_exp_4, lower_sentence)
383
+ if date_list != []:
384
+ date_entity = date_list[0].replace('ngày', '').replace('tháng', '').replace('năm', '').strip()
385
+ date_entity = re.sub('\s+', ' ', date_entity)
386
+ date_entity = date_entity.replace(' ', '/')
387
+ date_entity = date_entity + '/' + year
388
+ date_entity = add_0(date_entity)
389
+ result.append((sentence, date_entity))
390
+ return result
function/translate.py ADDED
@@ -0,0 +1,37 @@
1
+ from langdetect import detect
2
+ import requests
3
+ import json
4
+ import time
5
+
6
+ URL_TRANSLATOR = "http://10.9.3.241:8093/translator"
7
+ def detect_lang(text):
8
+ try:
9
+ lang = detect(text)
10
+ except:
11
+ lang = 'en'
12
+ return lang
13
+
14
+ def translate_text_multi_layer(source, target, text, url = URL_TRANSLATOR):
15
+ if source == "":
16
+ source = detect_lang(text)
17
+ print("PPPPPPPPPPPPP")
18
+ if not text.strip() or source == target:
19
+ return text
20
+
21
+ json_body = {
22
+ "doc": text,
23
+ "src_lang": source,
24
+ "tgt_lang": target
25
+ }
26
+ print("CCCCCCCCCCCC")
27
+ res = requests.post(url, json=json_body)
28
+ print("translate: ", res.status_code)
29
+ path_log = f"log_tran/requests_tran_{time.time()}.txt"
30
+ with open(path_log, "w", encoding="utf-8") as f:
31
+ f.write(json.dumps(json_body) + "\n")
32
+ if res.status_code == 200:
33
+ res = res.json()
34
+ with open(path_log, "a", encoding="utf-8") as f:
35
+ f.write(json.dumps(res) + "\n")
36
+ return res
37
+ return ''
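An illustrative call of the helper above, assuming the internal translator service at URL_TRANSLATOR is reachable; the function writes a request/response log into log_tran/, so that directory must exist, and the shape of the returned payload depends on the translator service itself:

import os
from function.translate import translate_text_multi_layer

os.makedirs("log_tran", exist_ok=True)
# Passing source="" lets the helper auto-detect the language with langdetect.
result = translate_text_multi_layer("", "en", "Xin chào thế giới")
print(result)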
function/utils.py ADDED
@@ -0,0 +1,94 @@
1
+ import editdistance
2
+ import requests
3
+ import numpy as np
4
+ import re
5
+ from .clean_text import normalize_text
6
+ URL_SBERT = "http://10.9.3.240:6789/sbert/encode_list"
7
+ # app_config.parse_url_api('api_sbert')
8
+
9
+ def get_sbert_embedding(lst_sentence, url = URL_SBERT):
10
+ input_data = {
11
+ "sentences": lst_sentence
12
+ }
13
+ embs = requests.post(url, json=input_data).json()
14
+ embs = np.array(embs)
15
+
16
+ return embs
17
+
18
+ def is_number(word):
19
+ lst_end = ['$', '%', 'vnđ', '.', ',']
20
+ word_lo = word.lower()
21
+ for k in lst_end:
22
+ word_lo = word_lo.replace(k, '')
23
+
24
+ if word_lo.isdigit():
25
+ return True
26
+ else:
27
+ return False
28
+
29
+
30
+ def get_number(text):
31
+ dt = text.split(' ')
32
+ for w in dt:
33
+ if is_number(w):
34
+ return w
35
+
36
+ return ''
37
+
38
+
39
+ def check_editdistance(ww1, ww2):
40
+ if len(ww1) == 0 or len(ww1) == 0:
41
+ return 0
42
+ else:
43
+ n_c = editdistance.eval(ww1.lower(), ww2.lower())
44
+ score = n_c / max(len(ww1), len(ww2))
45
+ return 1 - score
46
+
47
+
48
+ def remove_image_keyword(text_input):
49
+ lst_key = ["ảnh:", "ảnh :", "Ảnh:", "Ảnh :",
50
+ "Ảnh minh họa:", "Ảnh minh họa :", "ảnh minh họa:", "ảnh minh họa :",
51
+ "Nguồn:", "nguồn:", "Nguồn :", "nguồn :",
52
+ "Source:", "Source :", "source:", "source :",
53
+ "Src:", "Src :", "src:", "src :",
54
+ "Image:", "Image :", "img:", "img :",
55
+ "image:", "image :", "Img:", "Img :",
56
+ "xem tiếp", "xem thêm", "Xem tiếp", "Xem thêm"]
57
+ for k in lst_key:
58
+ text_input = text_input.replace(k, " ")
59
+ return text_input.strip()
60
+
61
+ def clean_text(text_in, normalize=True):
62
+ doc = re.sub('<.*?>', '', text_in)
63
+ doc = re.sub('(function).*}', ' ', doc)
64
+ # link
65
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.htm)', ' ', doc)
66
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.html)', ' ', doc)
67
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\/\/)', ' ', doc)
68
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.htm)', ' ', doc)
69
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.html)', ' ', doc)
70
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vn)', ' ', doc)
71
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.net)', ' ', doc)
72
+ doc = re.sub('(Nguồn)\s*?(https:\/\/).*?(\.vgp)', ' ', doc)
73
+ doc = re.sub('(Nguồn)\s*?(http:\/\/).*?(\.vgp)', ' ', doc)
74
+
75
+ doc = re.sub('(http:\/\/).*?(\.htm)', ' ', doc)
76
+ doc = re.sub('(http:\/\/).*?(\.html)', ' ', doc)
77
+ doc = re.sub('(https:\/\/).*?(\/\/)', ' ', doc)
78
+ doc = re.sub('(https:\/\/).*?(\.htm)', ' ', doc)
79
+ doc = re.sub('(https:\/\/).*?(\.html)', ' ', doc)
80
+ doc = re.sub('(https:\/\/).*?(\.vn)', ' ', doc)
81
+ doc = re.sub('(https:\/\/).*?(\.net)', ' ', doc)
82
+ doc = re.sub('(https:\/\/).*?(\.vgp)', ' ', doc)
83
+ doc = re.sub('(http:\/\/).*?(\.vgp)', ' ', doc)
84
+ # escape sequence
85
+ doc = re.sub('\n', ' ', doc)
86
+ doc = re.sub('\t', ' ', doc)
87
+ doc = re.sub('\r', ' ', doc)
88
+
89
+ if normalize:
90
+ doc = normalize_text(doc)
91
+ return doc
92
+
93
+ if __name__ == '__main__':
94
+ print(check_editdistance('tttt', 'tt'))
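A small sketch of get_sbert_embedding(), assuming the SBERT service at URL_SBERT is reachable and the project root is on PYTHONPATH: it embeds two headlines and compares them with cosine similarity, the same distance the clustering code relies on.

import numpy as np
from function.utils import get_sbert_embedding

embs = get_sbert_embedding(["Giá vàng hôm nay tăng mạnh", "Giá vàng trong nước đi lên"])
# Cosine similarity between the two sentence vectors.
cos = float(np.dot(embs[0], embs[1]) / (np.linalg.norm(embs[0]) * np.linalg.norm(embs[1])))
print(f"cosine similarity: {cos:.3f}")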
get_config.py ADDED
@@ -0,0 +1,5 @@
1
+ from yaml import load, Loader
2
+
3
+ config_params = {}
4
+ with open('config/config.yml', encoding='utf-8') as f:
5
+ config_params.update(load(f, Loader=Loader))
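config_params ends up as a plain dict mirroring config/config.yml. The section and key names below are hypothetical, since the YAML content is not part of this diff:

from get_config import config_params

rabbit_cfg = config_params.get("rabbitmq", {})  # hypothetical section name
print(rabbit_cfg.get("host"), rabbit_cfg.get("queue_name"))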
iclibs/ic_rabbit.py ADDED
@@ -0,0 +1,126 @@
1
+ # -*- coding: utf-8 -*-
2
+ import pika
3
+ import json
4
+
5
+
6
+ class ICRabbitMQ(object):
7
+ def __init__(self, host, virtual_host, usr, passw, **kwargs):
8
+ """
9
+ Initialize the connection parameters.
10
+ :param host: RabbitMQ server address
11
+ :param virtual_host: virtual host
12
+ :param queue_name: queue name
13
+ :param usr: RabbitMQ server user
14
+ :param passw: password
15
+ """
16
+ self.host = host
17
+ self.virtual_host = virtual_host
18
+ self.user = usr
19
+ self.passw = passw
20
+ self.credentials = pika.PlainCredentials(usr, passw)
21
+ self.connection = None
22
+ self.kwargs = kwargs
23
+
24
+ def init_connection(self):
25
+ self.connection = \
26
+ pika.BlockingConnection(
27
+ pika.ConnectionParameters(host=self.host, virtual_host=self.virtual_host, credentials=self.credentials))
28
+
29
+ def connection_close(self):
30
+ self.connection.close()
31
+
32
+ def connection_status(self):
33
+ return self.connection.is_open
34
+
35
+ def init_queue(self, queue_name, exchange="", exchange_type='fanout', durable=True, max_priority=-1):
36
+ """
37
+ Declare a queue (or an exchange when exchange is given).
38
+ :param exchange: exchange name; an empty string declares a plain queue instead
39
+ :param queue_name: queue name
40
+ :param durable: True (the queue survives a RabbitMQ restart)
41
+ :param max_priority: maximum priority level; -1 disables priorities,
42
+ any other value enables them (priority is capped at 10)
43
+ :return: channel
44
+ """
45
+ if self.connection is None:
46
+ self.init_connection()
47
+ channel = self.connection.channel()
48
+ if exchange == "" and queue_name != "":
49
+ if max_priority == -1:
50
+ channel.queue_declare(queue=queue_name, durable=durable)
51
+ else:
52
+ channel.queue_declare(queue=queue_name, durable=durable, arguments={'x-max-priority': max_priority})
53
+ else:
54
+ channel.exchange_declare(exchange=exchange, exchange_type=exchange_type, durable=durable)
55
+ return channel
56
+
57
+ @staticmethod
58
+ def publish_message(channel, routing_key, body, priority=-1, delivery_mode=2, exchange=''):
59
+ """
60
+ Publish a message.
61
+ :param channel: an already-created channel
62
+ :param routing_key: routing key, or the queue name when exchange = ''
63
+ :param body: payload to push (JSON-serialisable)
64
+ :param priority: priority level (-1 means no priority is set)
65
+ :param delivery_mode: 2 marks the message as persistent
66
+ :param exchange: exchange to route through ('' uses the default exchange)
67
+ """
68
+ if priority == -1:
69
+ channel.basic_publish(exchange=exchange, routing_key=routing_key, body=json.dumps(body),
70
+ properties=pika.BasicProperties(delivery_mode=delivery_mode))
71
+ else:
72
+ channel.basic_publish(exchange=exchange, routing_key=routing_key, body=json.dumps(body),
73
+ properties=pika.BasicProperties(delivery_mode=delivery_mode, priority=priority))
74
+ print("push done: ")
75
+
76
+ @staticmethod
77
+ def run_consummer(channel, queue_name, callback_func, is_ack=False):
78
+ """
79
+ Run a blocking consumer loop.
80
+ :param channel: an already-created channel
81
+ :param queue_name: queue name
82
+ :param callback_func: user-defined callback invoked for each message
83
+ :return:
84
+ """
85
+ print(" *wait message")
86
+
87
+ def callback(ch, method, properties, body):
88
+ body = json.loads(body.decode("utf-8"))
89
+ if is_ack:
90
+ ch.basic_ack(delivery_tag=method.delivery_tag)
91
+ callback_func(body, properties)
92
+ else:
93
+ callback_func(body, properties)
94
+ ch.basic_ack(delivery_tag=method.delivery_tag)
95
+
96
+
97
+ print("receive done: ")
98
+
99
+ channel.basic_qos(prefetch_count=10)
100
+ channel.basic_consume(queue=queue_name, on_message_callback=callback)
101
+ channel.start_consuming()
102
+
103
+
104
+ if __name__ == '__main__':
105
+ pass
106
+ # host, virtual_host, usr, passw = '10.9.2.220', 'camera-detect', 'tuan.dao', 'lEKUWKXwFt'
107
+ # rab = ICRabbitMQ(host, virtual_host, usr, passw)
108
+ # queue_name = 'test_rb'
109
+ #
110
+ # ## test run producer
111
+ # channel = rab.init_queue(queue_name)
112
+ # body_data = {"2": "3423432423"}
113
+ # ICRabbitMQ.publish_message(channel, queue_name, body_data)
114
+ #
115
+ #
116
+ # ## test run consumer
117
+ # def callback_func(body):
118
+ # """
119
+ # user-defined callback function
120
+ # :param body: message from the queue
121
+ # :return:
122
+ # """
123
+ # print(body)
124
+ #
125
+ #
126
+ # ICRabbitMQ.run_consummer(channel, queue_name, callback_func)
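A runnable variant of the commented example above, with placeholder connection details (replace host, virtual host and credentials with real values). publish_message() pushes one JSON message, then run_consummer() blocks and prints everything arriving on the queue:

from iclibs.ic_rabbit import ICRabbitMQ

def callback_func(body, properties):
    # body is the JSON-decoded message taken off the queue
    print(body)

if __name__ == "__main__":
    rab = ICRabbitMQ("localhost", "/", "guest", "guest")  # placeholder credentials
    channel = rab.init_queue("test_rb")
    ICRabbitMQ.publish_message(channel, "test_rb", {"msg": "hello"})
    ICRabbitMQ.run_consummer(channel, "test_rb", callback_func)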