# Hosting-page residue ("Spaces: Build error"); not part of the script.
import datetime
import os
import random

import pandas as pd
# Fixed seed so the random dev/main assignment is the same on every run.
random.seed(1996)

# Probability with which a corpus row is assigned to the dev split.
DEV_RATIO = 0.10
def choose_best_casing(orig, predicted):
    """Pick the better-cased variant of a title.

    A title written mostly in upper case carries no usable casing
    information, so the truecased prediction is preferred for it; otherwise
    the original casing is kept.

    Args:
        orig: Title as it appears in the source corpus.
        predicted: Truecased version of the same title.

    Returns:
        ``predicted`` if more than half of the characters of ``orig`` are
        upper-case letters, otherwise ``orig``.
    """
    # Count the upper-case characters themselves. Taking the length of a list
    # of comparison results always yields len(orig), which made the threshold
    # check vacuous.
    num_upper = sum(1 for c in orig if c.isupper())
    if num_upper > 0.5 * len(orig):
        return predicted
    return orig
def _strip_provider_prefix(provider):
    """Remove one leading ``*T_`` marker from a provider name, if present."""
    prefix = "*T_"
    # str.lstrip("*T_") strips any run of the characters '*', 'T' and '_',
    # so it would also eat a leading 'T' or '_' of the actual name.
    if provider.startswith(prefix):
        return provider[len(prefix):]
    return provider


def _write_title_files(out_dir, idx, text_data):
    """Write the best / original / truecased title of one row to three files."""
    variants = (
        ("best", "title"),
        ("orig", "title_orig"),
        ("truecase", "title_truecased"),
    )
    for suffix, key in variants:
        with open(f"{out_dir}/{idx}.{suffix}.txt", "w", encoding="utf-8") as f_out:
            f_out.write(text_data[key])


def split_data():
    """Split the migration corpus into a main and a dev subset and write both.

    Reads the corpus CSV and the truecased titles (indexed by row position),
    assigns each row to the dev split with probability ``DEV_RATIO`` and to
    the main split otherwise, writes three per-row title files, and finally
    writes one events CSV and one text-metadata CSV per split.
    """
    out_root = "output/migration/split_data"
    dev_txt_dir = f"{out_root}/split_dev10_sep_txt_files"
    main_txt_dir = f"{out_root}/split_main_sep_txt_files"
    # open(..., "w") does not create missing parent directories.
    for directory in (dev_txt_dir, main_txt_dir):
        os.makedirs(directory, exist_ok=True)

    events_main = []
    texts_main = []
    events_dev = []
    texts_dev = []

    with open("data/migration/corpus_titoli_all_raw.truecase_bilstm.txt", encoding="utf-8") as f:
        titles_tc = [line.strip() for line in f]

    df_all = pd.read_csv("data/migration/corpus_all.csv", encoding="latin-1")

    for idx, (_, row) in enumerate(df_all.iterrows()):
        if idx % 1000 == 0:
            print("Processing line:", idx)

        year = int(row["Anno"])
        event_data = {
            "event:id": idx,
            "event:year": year,
        }
        text_data = {
            "event_id": idx,
            "text_id": idx,
            "pubyear": year,
            "language": "Italian",
            "provider": _strip_provider_prefix(row["Testata"]),
            "title": choose_best_casing(row["Titolo"], titles_tc[idx]),
            "title_truecased": titles_tc[idx],
            "title_orig": row["Titolo"],
        }

        # Exactly one random draw per row keeps the split tied to the seed.
        if random.random() < DEV_RATIO:
            events, texts, txt_dir = events_dev, texts_dev, dev_txt_dir
        else:
            events, texts, txt_dir = events_main, texts_main, main_txt_dir

        events.append(event_data)
        texts.append(text_data)
        _write_title_files(txt_dir, idx, text_data)

    pd.DataFrame(events_main).to_csv(f"{out_root}/split_main.events.csv")
    pd.DataFrame(texts_main).to_csv(f"{out_root}/split_main.texts.meta.csv")
    pd.DataFrame(events_dev).to_csv(f"{out_root}/split_dev10.events.csv")
    pd.DataFrame(texts_dev).to_csv(f"{out_root}/split_dev10.texts.meta.csv")
# Run the split only when executed as a script, not when imported.
if __name__ == "__main__":
    split_data()