chenxwh committed on
Commit
a106f67
·
verified ·
1 Parent(s): caff402

Upload veracity_with_scraped_text.py

Browse files
src/prediction/veracity_with_scraped_text.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ from tqdm import tqdm
5
+
6
+
7
def load_url_text_map(knowledge_store_dir, claim_id):
    """Build a URL -> scraped-text lookup for a single claim.

    Reads ``<knowledge_store_dir>/<claim_id>.json`` one line at a time,
    parsing every non-blank line as a JSON object. The object's ``"url"``
    value becomes a key, and the strings in its ``"url2text"`` value are
    joined with single spaces to form the corresponding text.

    Args:
        knowledge_store_dir: Directory holding the per-claim knowledge files.
        claim_id: Identifier used to form the file name ``f"{claim_id}.json"``.

    Returns:
        A dict mapping each URL to its concatenated text. The dict is empty
        when the knowledge file does not exist. If a URL occurs on several
        lines, the last occurrence wins.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``"url"`` or ``"url2text"`` key.
    """
    knowledge_file = os.path.join(knowledge_store_dir, f"{claim_id}.json")
    url_text_map = {}

    # A claim without a knowledge file simply has no scraped text.
    if not os.path.exists(knowledge_file):
        return url_text_map

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(knowledge_file, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            if not line.strip():
                continue
            data = json.loads(line)
            url_text_map[data["url"]] = " ".join(data["url2text"])

    return url_text_map
21
+
22
+
23
def main():
    """Add a ``scraped_text`` field to each evidence entry of a prediction file.

    Loads the JSON list of claims from ``--veracity_prediction_file``. For
    each claim, the URL -> text map is loaded from ``--knowledge_store_dir``
    and the matching text is stored under ``"scraped_text"`` on every
    evidence item whose ``"url"`` has non-empty text. Evidence with no (or
    empty) text is left unchanged and a warning is printed. The updated
    list is written to ``--output_file`` as indented UTF-8 JSON.
    """
    parser = argparse.ArgumentParser(
        description="Add scraped_text field to the prediction file."
    )
    parser.add_argument(
        "-i",
        "--veracity_prediction_file",
        default="data_store/dev_veracity_prediction.json",
        help="Json file with the veracity predictions.",
    )
    parser.add_argument(
        "-o",
        "--output_file",
        default="data_store/dev_veracity_prediction_for_submission.json",
        help="Json file with the veracity predictions and the scraped_text.",
    )
    parser.add_argument(
        "--knowledge_store_dir",
        type=str,
        # Without a value this would reach os.path.join as None and fail with
        # a TypeError on the first claim; let argparse report it up front.
        required=True,
        help="Directory of json files of the knowledge store containing url2text.",
    )
    args = parser.parse_args()

    # Read with the same encoding the output is written in.
    with open(args.veracity_prediction_file, encoding="utf-8") as f:
        predictions = json.load(f)

    for claim in tqdm(predictions, desc="Processing claims"):
        claim_id = claim["claim_id"]
        url_text_map = load_url_text_map(args.knowledge_store_dir, claim_id)

        # Process each evidence in the claim and add scraped_text
        for evidence in claim["evidence"]:
            url = evidence["url"]
            scraped_text = url_text_map.get(url)
            if scraped_text:
                evidence["scraped_text"] = scraped_text
            else:
                # Covers both an unknown URL and a URL whose text is empty.
                print(
                    f"Warning: No scraped text found for claim_id {claim_id} and url {url}"
                )

    with open(args.output_file, "w", encoding="utf-8") as output_file:
        json.dump(predictions, output_file, ensure_ascii=False, indent=4)

    print(f"Updated JSON saved to {args.output_file}")


if __name__ == "__main__":
    main()