# cocosoda_ruby / dataset / preprocess.py
# SalazarPevelll
# model
# 51c57f8
import gzip
import json
import os
import shutil
# Languages whose dataset directories are processed when run as a script.
LANGUAGES = ('ruby', 'go', 'java', 'javascript', 'php', 'python')

# Output splits, in processing order.  Each name maps to
# <language>/<name>.txt (URLs to keep, one per line) and
# <language>/<name>.jsonl (filtered output).
_SPLITS = ('train', 'valid', 'test', 'codebase')


def _collect_split_files(final_dir):
    """Bucket every ``.jsonl`` / ``.jsonl.gz`` file under *final_dir* by split.

    The split is decided from the file's path relative to *final_dir*: the
    first of ``train``, ``valid``, ``test`` found in it wins.  Files of the
    ``valid`` and ``test`` splits are additionally listed under ``codebase``.

    Returns:
        dict mapping each split name in ``_SPLITS`` to a list of file paths.
    """
    buckets = {split: [] for split in _SPLITS}
    for root, _dirs, names in os.walk(final_dir):
        for name in names:
            # Match on the suffix so stray files that merely contain
            # ".jsonl" (backups, partial downloads) are not parsed as data.
            if not name.endswith(('.jsonl', '.jsonl.gz')):
                continue
            path = os.path.join(root, name)
            rel = os.path.relpath(path, final_dir)
            if 'train' in rel:
                buckets['train'].append(path)
            elif 'valid' in rel:
                buckets['valid'].append(path)
                buckets['codebase'].append(path)
            elif 'test' in rel:
                buckets['test'].append(path)
                buckets['codebase'].append(path)
    return buckets


def _ensure_decompressed(path):
    """Return the plain-text path for *path*, gunzipping on disk if needed.

    A ``.gz`` file is decompressed next to itself and then removed, the same
    on-disk result as ``gzip -d``.  If the decompressed file already exists
    (for example because an earlier split already handled this archive) it
    is reused and nothing is rewritten.
    """
    if not path.endswith('.gz'):
        return path
    target = path[:-len('.gz')]
    if not os.path.exists(target):
        # Write to a temporary name first so an interrupted run never leaves
        # a truncated file under the final name.
        partial = target + '.partial'
        with gzip.open(path, 'rb') as src, open(partial, 'wb') as dst:
            shutil.copyfileobj(src, dst)
        os.replace(partial, target)
        os.remove(path)
    return target


def _load_records(paths):
    """Parse the JSON-lines files in *paths* into a dict keyed by ``url``.

    When the same ``url`` occurs more than once, the last record read wins.
    """
    records = {}
    for path in paths:
        with open(_ensure_decompressed(path), encoding='utf-8') as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines instead of crashing
                record = json.loads(line)
                records[record['url']] = record
    return records


def _blank_hidden_fields(record, split):
    """Empty, in place, the fields that must not appear in *split*'s output.

    ``valid`` and ``test`` records lose their code fields; ``codebase``
    records lose their docstring fields; ``train`` records are untouched.
    """
    if split in ('valid', 'test'):
        record['original_string'] = ''
        record['code'] = ''
        record['code_tokens'] = []
    if split == 'codebase':
        record['docstring'] = ''
        record['docstring_tokens'] = []


def _write_split(language_dir, split, records):
    """Write ``<split>.jsonl`` with the records named in ``<split>.txt``.

    Output follows the order of the URL list; URLs without a matching record
    are skipped.

    Raises:
        OSError: if ``<split>.txt`` cannot be read or the output cannot be
            written.
    """
    url_list = os.path.join(language_dir, split + '.txt')
    output = os.path.join(language_dir, split + '.jsonl')
    # Open the URL list first so a missing list does not leave behind an
    # empty output file.
    with open(url_list, encoding='utf-8') as urls, \
            open(output, 'w', encoding='utf-8') as sink:
        for line in urls:
            record = records.get(line.strip())
            if record is None:
                continue
            _blank_hidden_fields(record, split)
            sink.write(json.dumps(record) + '\n')


def preprocess_language(language, base_dir='.'):
    """Build the train/valid/test/codebase ``.jsonl`` files for *language*.

    Reads every ``.jsonl`` / ``.jsonl.gz`` file under
    ``<base_dir>/<language>/final`` and, for each split, writes
    ``<base_dir>/<language>/<split>.jsonl`` containing the records whose
    ``url`` is listed in ``<base_dir>/<language>/<split>.txt``.

    Args:
        language: Name of the language directory.
        base_dir: Directory containing the language directories.
    """
    language_dir = os.path.join(base_dir, language)
    buckets = _collect_split_files(os.path.join(language_dir, 'final'))
    # Each split is parsed from disk independently, so blanking fields for
    # valid/test never affects the separately loaded codebase records.
    for split in _SPLITS:
        _write_split(language_dir, split, _load_records(buckets[split]))


def main(languages=LANGUAGES, base_dir='.'):
    """Preprocess each language in *languages*, printing its name first."""
    for language in languages:
        print(language)
        preprocess_language(language, base_dir)


if __name__ == '__main__':
    main()