"""Merge per-date topic clusters into global clusters using SBERT label similarity."""
import json
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
from function.topic_clustering import model, AgglomerativeClustering

# Upper bound on the number of documents kept in each merged cluster.
MAX_NUM_DOC_PER_CLUSTER = 50
# Cosine-distance threshold used when merging topic labels.
MERGE_DISTANCE_THRESHOLD = 0.4
# Upper bound on the number of merged clusters returned, by request type.
MAX_CLUSTER_MONTHLY = 50
MAX_CLUSTER_DEFAULT = 20


def check_duplicate_title_domain(docs):
    """Return *docs* without repeated (domain, title) combinations.

    The first document carrying a given "<domain> <title>" key is kept and
    every later one with the same key is dropped; the relative order of the
    surviving documents is preserved.

    Args:
        docs: iterable of dict-like documents; missing ``domain``/``title``
            keys are treated as empty strings.

    Returns:
        A new list with the unique documents.
    """
    seen_keys = set()
    unique_docs = []
    for doc in docs:
        key = f"{doc.get('domain', '')} {doc.get('title','')}"
        if key in seen_keys:
            continue
        seen_keys.add(key)
        unique_docs.append(doc)
    return unique_docs


def _collect_topics(preprocess):
    """Flatten the per-date topic mappings into parallel label/topic lists."""
    labels = []
    topics = []
    for date_clusters in preprocess:
        topic_map = date_clusters.get('topic') or {}
        for topic_docs in topic_map.values():
            if not topic_docs:
                # An empty topic has no representative document to label it.
                continue
            head = topic_docs[0]
            # The first document of a topic acts as its representative.
            label = '. '.join([head.get('title') or '', head.get('snippet') or ''])
            labels.append(label)
            topics.append(topic_docs)
    return labels, topics


def _merge_topic_group(topics, topic_ids):
    """Merge the topics selected by *topic_ids* into one document list."""
    merged = []
    num_docs = 0
    for topic_id in topic_ids:
        topic_docs = topics[topic_id]
        # The head document carries the size of its original topic.
        num_docs += topic_docs[0].get('num_docs', 1)
        merged.extend(topic_docs)

    # Newest first; reverse=True keeps the original order among equal keys,
    # exactly like sorting on the negated timestamp.
    merged = sorted(merged, key=lambda doc: doc.get('created_time', 0), reverse=True)
    merged = check_duplicate_title_domain(merged)
    merged = merged[:MAX_NUM_DOC_PER_CLUSTER]
    # Shallow-copy each document so the caller's request is not mutated
    # (otherwise repeated calls on the same request would inflate num_docs).
    return [dict(doc, num_docs=num_docs) for doc in merged]


def main(req):
    """Merge the per-date topics of a request into ranked global clusters.

    Args:
        req: request dict. ``req['type']`` selects the cluster cap
            ('monthly' -> 50, anything else -> 20). ``req['preprocess']`` is
            a list of dicts whose optional ``'topic'`` entry maps a topic id
            to a list of document dicts.

    Returns:
        dict mapping rank (0 = largest cluster) to the list of documents of
        that cluster. Every document carries the merged ``num_docs`` total.

    Side effects:
        Prints the label clustering and dumps the result to ``zzz.json``.
    """
    request_type = req['type']
    max_cluster = MAX_CLUSTER_MONTHLY if request_type == 'monthly' else MAX_CLUSTER_DEFAULT

    lst_labels, lst_topics = _collect_topics(req.get('preprocess', []))

    final_clusters = []
    label_clusters = sbert_clustering(
        lst_labels, distance_threshold=MERGE_DISTANCE_THRESHOLD, return_ids=True)
    print(label_clusters)
    if label_clusters:
        for topic_ids in label_clusters.values():
            merged = _merge_topic_group(lst_topics, topic_ids)
            if merged:
                final_clusters.append(merged)
        final_clusters = sorted(
            final_clusters, key=lambda cluster: cluster[0]['num_docs'], reverse=True)
        final_clusters = final_clusters[:max_cluster]

    final_result = dict(enumerate(final_clusters))

    # ensure_ascii=False emits raw non-ASCII text, so pin the file encoding
    # instead of relying on the platform default.
    with open('zzz.json', 'w', encoding='utf-8') as f:
        json.dump(final_result, f, ensure_ascii=False)
    return final_result


def get_sbert_embedding(lst_sentence):
    """Encode *lst_sentence* with the shared SBERT ``model`` and return the embeddings."""
    embs = model.encode(lst_sentence)
    return embs


def sbert_clustering(lst_sentence, distance_threshold=0.25, return_ids = False):
    """Group sentences by agglomerative clustering of their SBERT embeddings.

    Underscores in the sentences are replaced by spaces before encoding.

    Args:
        lst_sentence: list of sentences to cluster.
        distance_threshold: linkage distance above which clusters are not
            merged (complete linkage, cosine distance).
        return_ids: when True the clusters hold sentence indices instead of
            the (underscore-free) sentences themselves.

    Returns:
        ``None`` for empty input; otherwise a dict mapping cluster label to
        its members, ordered from the largest cluster to the smallest.
    """
    lst_sentence = [sen.replace('_',' ') for sen in lst_sentence]
    if not lst_sentence:
        return None
    if len(lst_sentence) == 1:
        # Clustering needs at least two samples; a single sentence is its
        # own cluster.
        return {0: [0]} if return_ids else {0: lst_sentence}

    embs = get_sbert_embedding(lst_sentence)
    # NOTE(review): if AgglomerativeClustering is scikit-learn's, `affinity`
    # was renamed to `metric` (deprecated in 1.2, removed in 1.4) - confirm
    # the pinned version before upgrading.
    hyer_clusteror = AgglomerativeClustering(
        n_clusters=None,
        compute_full_tree=True,
        affinity='cosine',
        linkage='complete',
        distance_threshold=distance_threshold,
    )
    hyer_clusteror.fit(embs)

    # Pre-create the buckets in label order so ties in the final size sort
    # keep ascending cluster labels.
    clusters = {label: [] for label in range(hyer_clusteror.n_clusters_)}
    for index, label in enumerate(hyer_clusteror.labels_):
        member = index if return_ids else lst_sentence[index]
        clusters[int(label)].append(member)

    ranked = sorted(clusters.items(), key=lambda item: len(item[1]), reverse=True)
    return dict(ranked)


if __name__ == '__main__':
    with open("input_merge.json", 'r', encoding='utf-8') as f:
        req = json.load(f)
    main(req)