File size: 2,350 Bytes
fcd0a70 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
import json
from tqdm import tqdm
# ast.literal_eval parses the list-formatted answers; re drives the tokenizer
import ast, re
# Load the instruction-style training set; its entries are consumed further
# down by extract_entity_spans().
path = 'train.json'
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale encoding (which breaks on non-ASCII text on some systems).
with open(path, 'r', encoding='utf-8') as f:
    data = json.load(f)
def tokenize_text(text):
    """Split *text* into word and punctuation tokens.

    A token is either a run of word characters, optionally extended by more
    runs that are each joined with a single '-' or '_', or any other single
    non-whitespace character. Whitespace itself is never returned.
    """
    token_pattern = r'\w+(?:[-_]\w+)*|\S'
    # The pattern has no capturing groups, so each match's full text is the token.
    return [match.group() for match in re.finditer(token_pattern, text)]
def extract_entity_spans(entry):
    """Turn one instruction-style conversation into token-level entity spans.

    ``entry['conversations']`` is read as a sequence of turns, each a dict
    with a ``'from'`` speaker and a ``'value'`` string:

    * a ``'human'`` turn starting with ``"Text: "`` supplies the passage,
      which is tokenized with ``tokenize_text``;
    * a ``'human'`` turn starting with ``"What describes "`` names an entity
      type (the text between that prefix and the trailing
      ``" in the text?"``, with spaces replaced by underscores);
    * a ``'gpt'`` turn starting with ``"["`` is parsed with
      ``ast.literal_eval`` as the list of entity strings for the most
      recently asked type.

    Every case-insensitive occurrence of an entity's tokens in the passage
    yields one span.

    Returns:
        A tuple ``(entity_spans, tokenized_text)`` where ``entity_spans`` is
        a list of ``(start, end, entity_type)`` tuples with inclusive token
        indices, and ``tokenized_text`` is the passage's token list (empty
        when the entry has no ``"Text: "`` turn).

    Raises:
        ValueError, SyntaxError: if a ``'gpt'`` answer is not a valid
            Python literal (propagated from ``ast.literal_eval``).
    """
    text_prefix = "Text: "
    question_prefix = "What describes "
    question_suffix = " in the text?"

    # Defined up front so an entry without a "Text: " turn returns an empty
    # token list instead of failing on an unbound local.
    tokenized_text = []
    current_type = None
    # Each entity string is stored together with its type, so the two can
    # never drift out of alignment the way two parallel lists can.
    typed_entities = []

    for turn in entry['conversations']:
        speaker = turn['from']
        value = turn['value']
        if speaker == 'human':
            if value.startswith(text_prefix):
                tokenized_text = tokenize_text(value[len(text_prefix):])
            elif value.startswith(question_prefix):
                asked = value[len(question_prefix):-len(question_suffix)]
                current_type = asked.replace(' ', '_')
        elif speaker == 'gpt' and value.startswith('['):
            if current_type is None:
                # An answer with no preceding question has no type to attach.
                continue
            # literal_eval only accepts literals, so it is safe on this text;
            # an empty list simply contributes no entities.
            for entity_text in ast.literal_eval(value):
                typed_entities.append((entity_text, current_type))

    # Lowercase the passage once rather than once per candidate window.
    lowered_text = [token.lower() for token in tokenized_text]

    entity_spans = []
    for entity_text, entity_type in typed_entities:
        lowered_entity = [token.lower() for token in tokenize_text(entity_text)]
        if not lowered_entity:
            # An empty or whitespace-only entity would "match" at every
            # position and produce spans whose end precedes their start.
            continue
        width = len(lowered_entity)
        for start in range(len(lowered_text) - width + 1):
            if lowered_text[start:start + width] == lowered_entity:
                entity_spans.append((start, start + width - 1, entity_type))
    return entity_spans, tokenized_text
# Usage: preview the spans extracted from one entry as a quick sanity check.
# The index is an arbitrary sample (not the first entry); the preview is
# skipped when the dataset is too small to contain it.
sample_index = 17818
if sample_index < len(data):
    entry = data[sample_index]
    entity_spans, tokenized_text = extract_entity_spans(entry)
    print("Entity Spans:", entity_spans)
    # print("Tokenized Text:", tokenized_text)

# Convert every entry into {"tokenized_text": tokens, "ner": spans}.
all_data = []
for entry in tqdm(data):
    entity_spans, tokenized_text = extract_entity_spans(entry)
    all_data.append({"tokenized_text": tokenized_text, "ner": entity_spans})

# Span tuples are serialized as JSON arrays.
with open('train_instruct.json', 'w', encoding='utf-8') as f:
    json.dump(all_data, f)
|