Spaces:
Sleeping
Sleeping
import json | |
from tqdm import tqdm | |
# ast.literal_eval | |
import ast, re | |
# Input file holding the conversation entries that the rest of the script converts.
path = 'train.json'
# JSON text is UTF-8 by specification; naming the encoding keeps decoding from
# depending on the platform's locale default.
with open(path, 'r', encoding='utf-8') as f:
    data = json.load(f)
def tokenize_text(text):
    """Split *text* into a list of token strings.

    A token is either a run of word characters, optionally continued by
    ``-`` or ``_`` followed by more word characters (so ``world-wide`` stays
    one token), or any single non-whitespace character. Whitespace is
    discarded.
    """
    token_pattern = r'\w+(?:[-_]\w+)*|\S'
    tokens = re.findall(token_pattern, text)
    return tokens
def extract_entity_spans(entry):
    """Locate the answered entities of one conversation inside its text.

    The conversation is read turn by turn:

    * a ``human`` turn starting with ``'Text: '`` supplies the text, which is
      tokenized with ``tokenize_text``;
    * a ``human`` turn starting with ``'What describes '`` names the entity
      type being asked about (spaces in the type become underscores);
    * a ``gpt`` turn starting with ``'['`` is parsed as a Python list literal
      of entity strings answering the most recent question.

    Every case-insensitive occurrence of each entity's token sequence in the
    tokenized text yields one span.

    Args:
        entry: Mapping with a ``'conversations'`` list whose items are
            mappings with ``'from'`` and ``'value'`` keys.

    Returns:
        A tuple ``(entity_spans, tokenized_text)``. ``entity_spans`` is a list
        of ``(start, end, entity_type)`` tuples with inclusive token indices
        into ``tokenized_text``. ``tokenized_text`` is an empty list when the
        conversation has no ``'Text: '`` turn.

    Raises:
        Whatever ``ast.literal_eval`` raises (e.g. ``ValueError`` or
        ``SyntaxError``) when an answer is not a valid literal.
    """
    text_prefix = 'Text: '
    question_prefix = 'What describes '
    question_suffix = ' in the text?'

    # Start empty so a conversation without a text turn returns cleanly
    # instead of failing on an unbound name.
    tokenized_text = []
    current_type = None
    # (entity_text, entity_type) pairs. Keeping each text with its type makes
    # it impossible for the two to drift out of step.
    entities = []

    for turn in entry['conversations']:
        speaker = turn['from']
        value = turn['value']
        if speaker == 'human':
            if value.startswith(text_prefix):
                tokenized_text = tokenize_text(value[len(text_prefix):])
            elif value.startswith(question_prefix):
                # Assumes the question ends with ' in the text?'; the slice
                # drops that many trailing characters without checking.
                raw_type = value[len(question_prefix):-len(question_suffix)]
                current_type = raw_type.replace(' ', '_')
        elif speaker == 'gpt' and value.startswith('['):
            if current_type is None:
                # An answer with no preceding question has no type to carry.
                continue
            # An empty list (however it is spelled) simply adds nothing.
            for entity_text in ast.literal_eval(value):
                entities.append((entity_text, current_type))

    # Lowercase once up front; comparing token lists is equivalent to
    # comparing space-joined strings because tokens never contain whitespace.
    lowered_text = [token.lower() for token in tokenized_text]

    entity_spans = []
    for entity_text, entity_type in entities:
        entity_tokens = [token.lower() for token in tokenize_text(entity_text)]
        width = len(entity_tokens)
        if not width:
            # An entity with no tokens would "match" at every position and
            # produce spans whose end precedes their start.
            continue
        for start in range(len(lowered_text) - width + 1):
            if lowered_text[start:start + width] == entity_tokens:
                entity_spans.append((start, start + width - 1, entity_type))

    return entity_spans, tokenized_text
# Usage:
# Spot-check one entry before converting the whole dataset. The index is an
# arbitrary sample, so the check is skipped when the dataset is too small
# rather than aborting the conversion.
sample_index = 17818
if len(data) > sample_index:
    entry = data[sample_index]
    entity_spans, tokenized_text = extract_entity_spans(entry)
    print("Entity Spans:", entity_spans)
    #print("Tokenized Text:", tokenized_text)

# Convert every entry into a {"tokenized_text": ..., "ner": ...} record.
all_data = []
for entry in tqdm(data):
    entity_spans, tokenized_text = extract_entity_spans(entry)
    all_data.append({"tokenized_text": tokenized_text, "ner": entity_spans})

# Span tuples are serialized as JSON arrays by json.dump.
with open('train_instruct.json', 'w') as f:
    json.dump(all_data, f)