Spaces:
No application file
No application file
import glob
import json
import os
def extract_arabic_data(input_files, output_file):
    """Collect OCR line texts from several JSON files into one JSON file.

    Each input file is parsed as JSON and iterated entry by entry. For every
    entry that contains a "lines" key, the "text" value of each line (or ""
    when a line has no "text" key) is stored as ``{"Arabic text": <text>}``.
    No language filtering happens here: texts are copied as found.

    Args:
        input_files: Paths of the UTF-8 JSON files to read; must support
            ``len()`` because the count is reported at the end.
        output_file: Path of the JSON file to write (UTF-8, 4-space indent,
            non-ASCII characters kept unescaped). Missing parent directories
            are created.
    """
    arabic_data = []
    for input_file in input_files:
        # Load the JSON document from the current input file.
        with open(input_file, encoding="utf-8") as json_file:
            json_data = json.load(json_file)

        # Keep only the text of every line; entries without "lines" are skipped.
        for entry in json_data:
            if "lines" in entry:
                arabic_data.extend(
                    {"Arabic text": line.get("text", "")}
                    for line in entry["lines"]
                )

    # Opening a file for writing does not create its directory, so make sure
    # it exists first (a bare file name has no directory part to create).
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, mode="w", encoding="utf-8") as json_output_file:
        json.dump(arabic_data, json_output_file, ensure_ascii=False, indent=4)
    print("Arabic data from", len(input_files), "json files has been extracted in :", output_file)
if __name__ == "__main__":
    # glob.glob returns matches in arbitrary order; sort them so the order of
    # records in the output file is reproducible from run to run.
    input_files = sorted(glob.glob("cache/GB/ocr_output*.json"))
    output_json_file = "cache/output/basic_info_frame.json"
    extract_arabic_data(input_files, output_json_file)