import pandas as pd
import json

# Convert the filtered synthetic caption JSON into a Feather file, adding a
# 1-based "index" column and a per-caption word count.

# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's locale (which would corrupt or reject non-ASCII captions
# on systems whose default encoding is not UTF-8).
with open("data/ccs_synthetic_filtered_large.json", encoding="utf-8") as f:
    d = json.load(f)

df = pd.DataFrame(d)

# Row identifier offset by one from the DataFrame index
# (1-based, assuming the default 0..n-1 index produced by the constructor).
df["index"] = df.index + 1

# Number of whitespace-separated tokens in each caption.
# NOTE(review): assumes every "caption" value is a string; a missing/null
# caption would raise here — confirm against the input data.
df["nr_words"] = df["caption"].apply(lambda x: len(x.split()))

df.to_feather("data/ccs_synthetic.feather")