|
import gzip
import json
import os
import shutil
|
|
|
# Languages processed by ``main``; each is expected to be a directory under
# the working directory.
LANGUAGES = ('ruby', 'go', 'java', 'javascript', 'php', 'python')

# Output tags, in the order their ``<tag>.jsonl`` files are written.
_TAGS = ('train', 'valid', 'test', 'codebase')


def _collect_split_files(final_dir):
    """Group the ``.jsonl`` files found under *final_dir* by split.

    A file is assigned to the first of ``train`` / ``valid`` / ``test`` whose
    name occurs in its path relative to *final_dir*. Matching on the relative
    path keeps parent directory names (e.g. a checkout under ``/data/test/``)
    from influencing the classification. ``valid`` and ``test`` files are
    additionally listed under ``codebase``.

    Returns:
        A dict mapping each of ``train``, ``valid``, ``test`` and ``codebase``
        to a list of file paths, in ``os.walk`` order.
    """
    split_files = {tag: [] for tag in _TAGS}
    for root, _dirs, names in os.walk(final_dir):
        for name in names:
            path = os.path.join(root, name)
            relative = os.path.relpath(path, final_dir)
            if '.jsonl' not in relative:
                continue
            if 'train' in relative:
                split_files['train'].append(path)
            elif 'valid' in relative:
                split_files['valid'].append(path)
                split_files['codebase'].append(path)
            elif 'test' in relative:
                split_files['test'].append(path)
                split_files['codebase'].append(path)
    return split_files


def _ensure_decompressed(path):
    """Return the path of the uncompressed file for *path*.

    A ``.gz`` file is decompressed next to itself and the archive is removed,
    which is what ``gzip -d`` does. If the archive is already gone (the same
    path is listed under both its own split and ``codebase``) the
    decompression step is skipped and the uncompressed path is returned.
    """
    if not path.endswith('.gz'):
        return path
    target = path[:-len('.gz')]
    if os.path.exists(path):
        with gzip.open(path, 'rb') as src, open(target, 'wb') as dst:
            shutil.copyfileobj(src, dst)
        os.remove(path)
    return target


def _load_records(paths):
    """Parse every JSON line in *paths* into a dict keyed by its ``url``.

    Blank lines are skipped. When the same ``url`` occurs more than once the
    record read last wins.
    """
    records = {}
    for path in paths:
        with open(_ensure_decompressed(path), encoding='utf-8') as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    continue
                record = json.loads(line)
                records[record['url']] = record
    return records


def _write_split(language_dir, tag, records):
    """Write ``<tag>.jsonl`` with the records whose url is in ``<tag>.txt``.

    Output order follows the url list; urls with no matching record are
    skipped. For ``valid`` and ``test`` the code fields are blanked, for
    ``codebase`` the docstring fields are blanked.

    Raises:
        FileNotFoundError: if ``<tag>.txt`` does not exist.
    """
    url_list_path = os.path.join(language_dir, tag + '.txt')
    output_path = os.path.join(language_dir, tag + '.jsonl')
    # Open the url list first so a missing list does not leave behind a
    # freshly truncated output file.
    with open(url_list_path, encoding='utf-8') as url_list, \
            open(output_path, 'w', encoding='utf-8') as output:
        for line in url_list:
            url = line.strip()
            if url not in records:
                continue
            # Work on a shallow copy so the loaded record is left untouched.
            record = dict(records[url])
            if tag in ('valid', 'test'):
                record['original_string'] = ''
                record['code'] = ''
                record['code_tokens'] = []
            if tag == 'codebase':
                record['docstring'] = ''
                record['docstring_tokens'] = []
            output.write(json.dumps(record) + '\n')


def preprocess_language(language, base_dir='.'):
    """Build the ``train``/``valid``/``test``/``codebase`` files for *language*.

    Reads the ``.jsonl`` / ``.jsonl.gz`` files under
    ``<base_dir>/<language>/final`` and the url lists
    ``<base_dir>/<language>/<tag>.txt``, and writes
    ``<base_dir>/<language>/<tag>.jsonl`` for each tag. Compressed inputs are
    decompressed in place as a side effect.

    Args:
        language: Name of the language directory.
        base_dir: Directory containing the language directories; defaults to
            the current working directory.
    """
    language_dir = os.path.join(base_dir, language)
    split_files = _collect_split_files(os.path.join(language_dir, 'final'))
    # Each tag is parsed from its own file list and written before the next
    # one is loaded, so only one split is held in memory at a time.
    for tag in _TAGS:
        _write_split(language_dir, tag, _load_records(split_files[tag]))


def main():
    """Preprocess every language in ``LANGUAGES`` relative to the cwd."""
    for language in LANGUAGES:
        print(language)
        preprocess_language(language)


if __name__ == "__main__":
    main()
|
|
|
|