ThornRugal committed on
Commit
e5451b9
1 Parent(s): 32cabe6

Add application files and updated Readme

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. README.md +16 -13
  3. app.py +170 -0
  4. data_Excel_format.xlsx +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data_Excel_format.xlsx filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,13 +1,16 @@
1
- ---
2
- title: ChinesePrivacyPolicyMark
3
- emoji: 👁
4
- colorFrom: gray
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 5.5.0
8
- app_file: app.py
9
- pinned: false
10
- short_description: Mark Chinese Privacy Policy with Retrieve models
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
1
+ ---
2
+ title: ChinesePrivacyPolicyMark
3
+ emoji: 👁
4
+ colorFrom: gray
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.5.0
8
+ app_file: app.py
9
+ pinned: false
10
+ short_description: Mark Chinese Privacy Policy with Retrieve models
11
+ ---
12
+ 使用的数据地址:https://github.com/EnlightenedAI/CAPP-130<br>
13
+ 使用预训练好的模型检索预先保存好的隐私政策,以此标注隐私政策中的关键信息。<br>
14
+ 首先使用特征提取模型将隐私政策中的句子编码为向量,将其与保存的向量对比进行一次“粗筛”,选取与其最为接近的n条记录。<br>
15
+ 之后使用文本相似度计算模型,将筛选出来的n条记录与原本的文本进行匹配,过滤出相似度高于阈值p的m条记录,将这m条记录所属的标记合并起来。<br>
16
+ 由于没有使用GPU,直接在Space中运行会很慢。有条件可以clone下来试试。
app.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import warnings
3
+ warnings.filterwarnings("ignore")
4
+
5
+ import pandas as pd
6
+ import numpy as np
7
+ import faiss
8
+ import ast
9
+
10
+ import torch.nn.functional as F
11
+ import torch
12
+ from transformers import AutoModel, AutoTokenizer
13
+
14
# Stage 1 ("coarse filter"): sentence-embedding model used to build and query
# the vector index. Loaded with the checkpoint's own remote code, in bfloat16.
Encoding_model = 'jinaai/jina-embeddings-v2-base-zh'
model = AutoModel.from_pretrained(
    Encoding_model,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

# Stage 2 ("fine filter"): tokenizer + model used to re-score the candidates
# returned by stage 1. Both are loaded from the same checkpoint id.
_similarity_checkpoint = 'Alibaba-NLP/gte-multilingual-base'
similarity_tokenizer = AutoTokenizer.from_pretrained(_similarity_checkpoint)
similarity_model = AutoModel.from_pretrained(
    _similarity_checkpoint,
    trust_remote_code=True,
)
21
+
22
def get_not_empty_data(df, x_column="text", y_column="label"):
    """Collapse labelled rows into one row per distinct text.

    Rows whose ``y_column`` cell is the literal string ``"[]"`` are dropped.
    Every remaining ``y_column`` cell is parsed with ``ast.literal_eval``; the
    parsed lists of rows that share the same ``x_column`` value are
    concatenated and then de-duplicated.

    Args:
        df: DataFrame containing ``x_column`` and ``y_column``.
        x_column: Name of the column holding the text.
        y_column: Name of the column holding the stringified label list.

    Returns:
        A DataFrame with column ``"x"`` (distinct texts, in first-seen order)
        and column ``"y"`` (list of unique labels; order within each list is
        unspecified because it goes through a ``set``).
    """
    labelled = df[df[y_column] != "[]"].reset_index(drop=True)

    merged = {}
    for text, raw_labels in zip(labelled[x_column], labelled[y_column]):
        merged.setdefault(text, []).extend(ast.literal_eval(raw_labels))

    unique = {text: list(set(labels)) for text, labels in merged.items()}
    return pd.DataFrame({"x": unique.keys(), "y": unique.values()})
33
+
34
# Load the annotated corpus once at start-up and keep two parallel arrays:
# x_dict[i] is a distinct text and y_dict[i] is its list of labels.
data_all = pd.read_excel("data_Excel_format.xlsx")
df_dict_all = get_not_empty_data(data_all)
x_dict, y_dict = df_dict_all["x"].values, df_dict_all["y"].values
38
+
39
def calc_scores(x):
    """Score the first row of ``x`` against all remaining rows.

    Args:
        x: 2-D array whose first row is the query vector and whose other rows
            are candidate vectors.

    Returns:
        The matrix product of the first row (kept 2-D) with the transposed
        remaining rows, i.e. a ``(1, len(x) - 1)`` array of dot products.
    """
    query = x[:1]
    candidates = x[1:]
    return query @ candidates.T
41
+
42
def get_idxs(threshold, max_len, arr):
    """Return positions in ``arr`` whose value is at least ``threshold``.

    Args:
        threshold: Minimum value a position must have to be selected.
        max_len: Maximum number of positions to return.
        arr: 1-D NumPy array of scores.

    Returns:
        A NumPy integer array. When fewer than ``max_len`` positions qualify,
        all of them are returned in ascending position order. Otherwise only
        the ``max_len`` highest-scoring positions are returned, ordered by
        descending score.
    """
    hits = np.where(arr >= threshold)[0]
    if len(hits) < max_len:
        return hits
    # Rank the qualifying positions by score and honour the caller's cap
    # (previously this was hard-coded to 3 regardless of max_len).
    ranked = hits[np.argsort(-arr[hits])]
    return ranked[:max_len]
48
+
49
def merge_set_to_list(set_list):
    """Return the union of every set in ``set_list``.

    Despite the name, the result is a ``set`` (empty when ``set_list`` is
    empty), not a list.
    """
    merged = set()
    merged.update(*set_list)
    return merged
54
+
55
+
56
def get_predict_result(index, score, threshold, max_len):
    """Turn re-scored candidates into a merged label set.

    Args:
        index: Array of corpus row ids (positions into ``x_dict``/``y_dict``)
            for the candidates; flattened before use.
        score: Array of similarity scores aligned with ``index``; flattened
            before use.
        threshold: Minimum score a candidate needs to be kept.
        max_len: Maximum number of candidates to keep.

    Returns:
        A ``(labels, texts)`` pair. ``labels`` is the union of the label lists
        of the kept candidates (a ``set``); ``texts`` holds the matching
        entries of ``x_dict``. When nothing passes the threshold the result is
        ``(set(), [])``.
    """
    score = score.flatten()
    index = index.flatten()
    max_len = int(max_len)  # tolerate a float coming from a UI slider

    keep = np.where(score >= threshold)[0]
    if len(keep) >= max_len:
        # Keep the best-scoring candidates. The previous code ranked by the
        # corpus row id and always cut at 3, ignoring both score and max_len.
        keep = keep[np.argsort(-score[keep])][:max_len]
    if len(keep) == 0:
        # An empty set (not a dict) so both branches return the same type.
        return set(), []

    res_index = index[keep]
    labels = merge_set_to_list([set(i) for i in y_dict[res_index]])
    return labels, x_dict[res_index]
67
+
68
# Embed every corpus text and build an inner-product index over the
# L2-normalised vectors (inner product of unit vectors == cosine similarity).
bsize = 256
# Seed with an empty (0, 768) block so the concatenate below also works for an
# empty corpus.
_chunks = [np.empty(shape=[0, 768], dtype="float32")]
with torch.no_grad():
    # The corpus texts live in x_dict; the loop previously referenced an
    # undefined name `x`, which raised NameError at import time.
    for _start in range(0, len(x_dict), bsize):
        _batch = x_dict[_start:_start + bsize].tolist()
        _chunks.append(np.asarray(model.encode(_batch), dtype="float32"))
# Concatenate once instead of re-copying the growing matrix on every batch.
vec = np.concatenate(_chunks)

index = faiss.IndexFlatIP(768)
faiss.normalize_L2(vec)
index.add(vec)
# Persist the index and load it back, as before.
faiss.write_index(index, "all_index.faiss")
index = faiss.read_index("all_index.faiss")
81
+
82
def predict_label(x, threshold=0.85, n_nearest=10, max_result_len=3):
    """Predict label sets for each sentence in ``x``.

    For every batch (currently one sentence at a time) the sentence is
    embedded with ``model``, its ``n_nearest`` neighbours are fetched from the
    FAISS ``index``, and the sentence plus its neighbours are re-embedded with
    ``similarity_model``. Neighbours whose re-computed similarity reaches
    ``threshold`` contribute their labels via ``get_predict_result``.

    Args:
        x: List of sentences (must support slicing into lists of ``str``).
        threshold: Minimum second-stage similarity for a neighbour to count.
        n_nearest: Number of neighbours retrieved in the first stage.
        max_result_len: Cap on neighbours kept after the second stage.

    Returns:
        A list with one entry per batch; each entry is a list holding the
        label collection of every sentence in that batch.
    """
    n_nearest = int(n_nearest)  # faiss search and the reshape below need an int
    bsize = 1
    dimension = 768
    y_pred = []
    with torch.no_grad():
        for start in range(0, len(x), bsize):
            sentences = x[start:start + bsize]

            # Stage 1: coarse retrieval by cosine similarity.
            vec = model.encode(sentences)
            faiss.normalize_L2(vec)
            _, indexes = index.search(vec, n_nearest)

            # Each row: the query sentence followed by its retrieved texts.
            x_pred = np.array(
                [[sentences[j]] + s.tolist() for j, s in enumerate(x_dict[indexes])]
            )

            # Stage 2: re-embed query + candidates with the similarity model.
            batch_dict = similarity_tokenizer(
                x_pred.flatten().tolist(),
                max_length=768,
                padding=True,
                truncation=True,
                return_tensors='pt',
            )
            outputs = similarity_model(**batch_dict)
            # First-token embedding, truncated along the FEATURE axis. The old
            # `[:, 0][:dimension]` sliced rows (sentences) instead.
            embeddings = outputs.last_hidden_state[:, 0, :dimension]
            embeddings = F.normalize(embeddings, p=2, dim=1)
            embeddings = (
                embeddings.reshape(len(x_pred), n_nearest + 1, dimension)
                .detach()
                .cpu()
                .numpy()
            )
            scores = [calc_scores(group) for group in embeddings]

            pred = [
                get_predict_result(
                    indexes[k], scores[k], threshold=threshold, max_len=max_result_len
                )
                for k in range(len(scores))
            ]
            y_pred.append([labels for labels, _ in pred])
    return y_pred
103
+
104
# HTML preamble shown above every result: a style rule for the output element
# followed by a legend explaining what each highlight color means.
CSS_Content = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<style>
#custom_id {
border: 2px solid red;
padding: 10px;
background-color: lightgray;
}
</style>
</head>
</html>
<span style="color: red;line-height:1;">红色字体:潜在风险</span><br>
<span style="color: blue;line-height:1;">蓝色字体:权限获取</span><br>
<span style="color: purple;line-height:1;">紫色字体:数据收集</span><br>
<span style="color: green;line-height:1;">绿色字体:数据、权限管理</span><br>
<span style="color: brown;line-height:1;">棕色字体:共享、委托、转让、公开(披露)</span><br>
"""

# Label -> highlight color. Insertion order matters: generate_HTML walks the
# keys in this order and uses the first label that matches.
color_dict = {
    "潜在风险": "red",
    "权限获取": "blue",
    "数据收集": "purple",
    "数据、权限管理": "green",
    "共享、委托、转让、公开(披露)": "brown",
}
132
+
133
+
134
+
135
def generate_HTML(text, threshold=0.85, n_nearest=10, max_result_len=3):
    """Render ``text`` as HTML with each sentence colored by its predicted label.

    The text is split into lines on ``"\\n"`` and each line into sentences on
    ``"。"``. Every sentence is wrapped in a ``<span>``; when ``predict_label``
    yields labels for it, the span is colored with the first matching entry of
    ``color_dict``. Lines are separated by ``<br>`` and the whole result is
    prefixed with ``CSS_Content``.

    Args:
        text: Raw policy text entered by the user.
        threshold: Similarity threshold forwarded to ``predict_label``.
        n_nearest: Coarse-retrieval size forwarded to ``predict_label``.
        max_result_len: Fine-filter cap forwarded to ``predict_label``.

    Returns:
        The HTML string to display.
    """
    import html  # local import: only needed for escaping user text here

    pieces = [CSS_Content]
    for line in text.split("\n"):
        segments = line.split("。")

        # Blank segments (empty lines, the tail after a final "。") cannot
        # carry a label, so do not spend model time on them.
        positions = [k for k, seg in enumerate(segments) if seg.strip()]
        labels_at = {}
        if positions:
            predictions = predict_label(
                [segments[k] for k in positions], threshold, n_nearest, max_result_len
            )
            # predict_label yields one entry per sentence (it batches one
            # sentence at a time); the labels are that entry's first element.
            for k, pred in zip(positions, predictions):
                labels_at[k] = pred[0]

        spans = []
        for k, seg in enumerate(segments):
            labels = labels_at.get(k, ())
            style = ""
            # color_dict is ordered by decreasing importance, so only the
            # color of the first matching label is used.
            for name, color in color_dict.items():
                if name in labels:
                    style = f' style="color: {color};line-height:1;"'
                    break
            # Escape the user's text so characters such as "<" or "&" are
            # shown literally instead of being interpreted as markup.
            spans.append(f"<span{style}>{html.escape(seg)}</span>")

        pieces.append("。".join(spans))
        pieces.append("<br>")
    return "".join(pieces)
153
+
154
# UI layout: a text input, three tuning sliders, a submit button, and an HTML
# output area that initially shows only the color legend.
with gr.Blocks() as demo:
    with gr.Row():
        input_text = gr.Textbox(lines=25, label="输入")

    with gr.Row():
        threshold = gr.Slider(
            minimum=0.5, maximum=0.85, value=0.75, step=0.05,
            interactive=True, label="相似度阈值",
        )
        n_nearest = gr.Slider(
            minimum=3, maximum=10, value=10, step=1,
            interactive=True, label="粗筛语句数量",
        )
        max_result_len = gr.Slider(
            minimum=1, maximum=5, value=3, step=1,
            interactive=True, label="精筛语句数量",
        )

    with gr.Row():
        submit_button = gr.Button("检测")

    with gr.Row():
        output_text = gr.HTML(CSS_Content)
        # Matches the "#custom_id" rule declared in CSS_Content.
        output_text.elem_id = "custom_id"

    # Clicking the button runs the full pipeline and replaces the HTML output.
    submit_button.click(
        fn=generate_HTML,
        inputs=[input_text, threshold, n_nearest, max_result_len],
        outputs=output_text,
    )

demo.launch()
data_Excel_format.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db4b6d314555c48bc00053a4e581960e1991625d7962f3b88e00dd04c3233a6b
3
+ size 2846032