ImageDataExtractor2

Sleeping

App Files Files Community

ImageDataExtractor2 / backup /modules /data_proc.py

WebashalarForML

Upload 7 files

fcd0a70 verified 3 months ago

raw

history blame

2.35 kB

	import json
	from tqdm import tqdm
	# ast.literal_eval parses the list-literal answers; re drives the tokenizer
	import ast, re

	# Input dataset that the rest of this script converts.
	path = 'train.json'

	# Decode explicitly as UTF-8 so the parsed text does not depend on the
	# platform's default locale encoding.
	with open(path, 'r', encoding='utf-8') as f:
	    data = json.load(f)

	def tokenize_text(text):
	    """Split *text* into word and punctuation tokens.

	    A token is either a run of word characters, optionally extended by
	    further word runs joined with ``-`` or ``_`` (so ``state-of-the-art``
	    stays in one piece), or any other single non-whitespace character.
	    Whitespace itself is discarded.

	    Args:
	        text: The string to tokenize.

	    Returns:
	        The tokens as a list of strings, in order of appearance.
	    """
	    # The two alternatives are separated by a bare "|". Writing it as "\|"
	    # would instead require a literal pipe character after each word, so
	    # ordinary text would produce no tokens at all.
	    return re.findall(r'\w+(?:[-_]\w+)*|\S', text)

	def extract_entity_spans(entry):
	    """Locate every annotated entity of a conversation entry in its text.

	    The entry's ``'conversations'`` list is scanned for three kinds of turn:

	    * a human turn starting with ``'Text: '`` -- the passage to tokenize;
	    * a human turn starting with ``'What describes '`` -- a question whose
	      middle part names an entity type (spaces become underscores);
	    * a gpt turn starting with ``'['`` -- a Python list literal holding the
	      entity strings for the most recent question.

	    Args:
	        entry: Mapping with a ``'conversations'`` list whose items have
	            ``'from'`` and ``'value'`` keys.

	    Returns:
	        A ``(entity_spans, tokenized_text)`` pair. ``entity_spans`` is a list
	        of ``(start, end, entity_type)`` tuples with inclusive token indices
	        into ``tokenized_text``; matching is case-insensitive and every
	        occurrence of an entity is reported. ``tokenized_text`` is the token
	        list of the passage, or ``[]`` when the entry has no passage turn.

	    Raises:
	        ValueError, SyntaxError: If an answer is not a valid Python literal
	            (propagated from ``ast.literal_eval``).
	    """
	    text_prefix = "Text: "
	    question_prefix = "What describes "
	    question_suffix = " in the text?"

	    tokenized_text = []  # stays empty when the entry has no passage turn
	    current_type = None  # entity type named by the most recent question
	    labelled_entities = []  # (entity_text, entity_type) pairs

	    for c in entry['conversations']:
	        speaker, value = c['from'], c['value']
	        if speaker == 'human':
	            if value.startswith(text_prefix):
	                tokenized_text = tokenize_text(value[len(text_prefix):])
	            elif value.startswith(question_prefix):
	                # Assumes the question ends with question_suffix; the slice
	                # removes that many trailing characters unconditionally.
	                c_type = value[len(question_prefix):-len(question_suffix)]
	                current_type = c_type.replace(' ', '_')
	        elif speaker == 'gpt' and value.startswith('['):
	            if current_type is None:
	                # An answer with no preceding question has no type to carry.
	                continue
	            # Pairing each entity with its type right here keeps the two in
	            # step even when an answer turns out to be an empty list.
	            for entity_text in ast.literal_eval(value):
	                labelled_entities.append((entity_text, current_type))

	    # Lower-case the passage once instead of once per candidate window.
	    lowered_text = [token.lower() for token in tokenized_text]

	    entity_spans = []
	    for entity_text, entity_type in labelled_entities:
	        entity_tokens = [token.lower() for token in tokenize_text(entity_text)]
	        width = len(entity_tokens)
	        if not width:
	            # A zero-length window would "match" at every position.
	            continue
	        for start in range(len(lowered_text) - width + 1):
	            if lowered_text[start:start + width] == entity_tokens:
	                entity_spans.append((start, start + width - 1, entity_type))

	    return entity_spans, tokenized_text

	# Usage:
	# Spot-check one arbitrary entry before converting the whole dataset. The
	# check is skipped when the dataset is too small to contain that index.
	sample_index = 17818
	if sample_index < len(data):
	    entry = data[sample_index]
	    entity_spans, tokenized_text = extract_entity_spans(entry)
	    print("Entity Spans:", entity_spans)

	# Convert every entry into a record of the form
	# {"tokenized_text": tokenized_text, "ner": entity_spans}.
	all_data = []

	for entry in tqdm(data):
	    entity_spans, tokenized_text = extract_entity_spans(entry)
	    all_data.append({"tokenized_text": tokenized_text, "ner": entity_spans})

	with open('train_instruct.json', 'w', encoding='utf-8') as f:
	    json.dump(all_data, f)