# Convert conversation-format NER annotations (human/gpt turns) into records
# of tokenized text plus (start, end, entity_type) spans for NER training.
import ast
import json
import re

from tqdm import tqdm

path = 'train.json'

with open(path, 'r') as f:
    data = json.load(f)


def tokenize_text(text):
    # Split into word tokens (keeping internal hyphens/underscores intact)
    # and single non-whitespace punctuation characters.
    return re.findall(r'\w+(?:[-_]\w+)*|\S', text)
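
# Quick illustration of the tokenizer on an assumed example (not from the
# dataset): hyphenated words stay whole, punctuation becomes its own token.
assert tokenize_text("state-of-the-art NER!") == ['state-of-the-art', 'NER', '!']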


def extract_entity_spans(entry):
    """Parse one conversation entry into token-level (start, end, type) spans."""
    len_start = len("What describes ")
    len_end = len(" in the text?")
    entity_types = []
    entity_texts = []
    tokenized_text = []  # guard against entries with no "Text: " turn

    for c in entry['conversations']:
        # The passage to annotate arrives as a human turn: "Text: <passage>".
        if c['from'] == 'human' and c['value'].startswith('Text: '):
            text = c['value'][len('Text: '):]
            tokenized_text = tokenize_text(text)

        # "What describes <type> in the text?" announces the entity type that
        # the following gpt answer refers to.
        if c['from'] == 'human' and c['value'].startswith('What describes '):
            c_type = c['value'][len_start:-len_end]
            c_type = c_type.replace(' ', '_')
            entity_types.append(c_type)

        # The gpt answer is a Python-style list literal of entity strings,
        # e.g. "['Paris', 'Lyon']". An empty list means the last requested
        # type has no mentions, so drop it to keep the two lists aligned.
        elif c['from'] == 'gpt' and c['value'].startswith('['):
            if c['value'] == '[]':
                entity_types = entity_types[:-1]
                continue

            texts_ents = ast.literal_eval(c['value'])
            entity_texts.extend(texts_ents)
            # Repeat the current type so entity_texts and entity_types stay
            # parallel: one type per extracted entity string.
            num_repeat = len(texts_ents) - 1
            entity_types.extend([entity_types[-1]] * num_repeat)

    # Find every case-insensitive occurrence of each entity string in the
    # tokenized text; spans are inclusive token indices (start, end, type).
    entity_spans = []
    for j, entity_text in enumerate(entity_texts):
        entity_tokens = tokenize_text(entity_text)
        target = " ".join(entity_tokens).lower()
        for i in range(len(tokenized_text) - len(entity_tokens) + 1):
            window = " ".join(tokenized_text[i:i + len(entity_tokens)]).lower()
            if window == target:
                entity_spans.append((i, i + len(entity_tokens) - 1, entity_types[j]))

    return entity_spans, tokenized_text
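
# Minimal self-check on a synthetic entry. The conversation shape below is an
# assumption inferred from the parsing logic above, not taken from the dataset.
_demo = {'conversations': [
    {'from': 'human', 'value': 'Text: Paris is the capital of France.'},
    {'from': 'human', 'value': 'What describes city in the text?'},
    {'from': 'gpt', 'value': "['Paris']"},
]}
assert extract_entity_spans(_demo) == (
    [(0, 0, 'city')],
    ['Paris', 'is', 'the', 'capital', 'of', 'France', '.'],
)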

# Spot-check a single entry before converting the whole file.
entry = data[17818]
entity_spans, tokenized_text = extract_entity_spans(entry)
print("Entity Spans:", entity_spans)

# Convert every entry and collect records in {"tokenized_text", "ner"} form.
all_data = []
for entry in tqdm(data):
    entity_spans, tokenized_text = extract_entity_spans(entry)
    all_data.append({"tokenized_text": tokenized_text, "ner": entity_spans})

with open('train_instruct.json', 'w') as f:
    json.dump(all_data, f)
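
# Optional round-trip check: reload the dump and confirm the first record kept
# the expected structure (note that JSON turns the span tuples into lists).
with open('train_instruct.json', 'r') as f:
    reloaded = json.load(f)
print("Records:", len(reloaded))
print("Sample:", reloaded[0]['tokenized_text'][:8], reloaded[0]['ner'][:2])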