File size: 2,080 Bytes
51c57f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import json
import os
import subprocess

def _collect_shards(language):
    """Group the ``.jsonl`` shard paths under ``<language>/final`` by split.

    A path is classified by the first of 'train', 'valid', 'test' that occurs
    anywhere in it. Valid and test shards are additionally added to the
    codebase list.

    Returns:
        A ``(train, valid, test, codebase)`` tuple of path lists.
    """
    train, valid, test, codebase = [], [], [], []
    for root, _dirs, names in os.walk(language + '/final'):
        for name in names:
            path = os.path.join(root, name)
            if '.jsonl' not in path:
                continue
            if 'train' in path:
                train.append(path)
            elif 'valid' in path:
                valid.append(path)
                codebase.append(path)
            elif 'test' in path:
                test.append(path)
                codebase.append(path)
    return train, valid, test, codebase


def _load_records(paths):
    """Read JSON-lines shards and return a dict mapping ``js['url']`` to record.

    Shards ending in ``.gz`` are decompressed on disk with ``gzip -d`` first
    and then read from the decompressed path. Later records overwrite earlier
    ones that share a url.

    Raises:
        OSError: if a shard cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no 'url' field.
    """
    records = {}
    for path in paths:
        if path.endswith('.gz'):
            # Valid/test shards appear in two lists, so the archive may have
            # been decompressed by an earlier pass; only call gzip when the
            # archive is still there. The argument list form avoids handing a
            # formatted string to a shell. Failures are not fatal here, the
            # subsequent open() reports a missing file.
            if os.path.exists(path):
                subprocess.run(['gzip', '-d', path], check=False)
            path = path[:-len('.gz')]
        with open(path, encoding='utf-8') as shard:
            for line in shard:
                line = line.strip()
                if not line:
                    # Tolerate blank lines instead of failing in json.loads.
                    continue
                js = json.loads(line)
                records[js['url']] = js
    return records


def _write_split(language, tag, records):
    """Write ``<language>/<tag>.jsonl`` from the urls in ``<language>/<tag>.txt``.

    Each stripped line of the .txt file is looked up in ``records``; lines
    with no matching record are skipped. For the 'valid' and 'test' tags the
    code fields are blanked, for 'codebase' the docstring fields are blanked.
    NOTE(review): presumably this keeps queries and candidate code separate
    for retrieval evaluation; confirm against the consumer of these files.
    """
    with open('{}/{}.jsonl'.format(language, tag), 'w', encoding='utf-8') as out, \
            open('{}/{}.txt'.format(language, tag), encoding='utf-8') as urls:
        for line in urls:
            url = line.strip()
            if url not in records:
                continue
            # Shallow copy so the loaded records are left untouched.
            js = dict(records[url])
            if tag in ['valid', 'test']:
                js['original_string'] = ''
                js['code'] = ''
                js['code_tokens'] = []
            if tag == 'codebase':
                js['docstring'] = ''
                js['docstring_tokens'] = []
            out.write(json.dumps(js) + '\n')


# For every language directory: load all shards, then emit one filtered
# .jsonl file per split next to the corresponding .txt url list.
for language in ['ruby', 'go', 'java', 'javascript', 'php', 'python']:
    print(language)
    train, valid, test, codebase = _collect_shards(language)

    # Load order matters: the split that first touches a .gz shard is the one
    # that decompresses it.
    train_data = _load_records(train)
    valid_data = _load_records(valid)
    test_data = _load_records(test)
    codebase_data = _load_records(codebase)

    for tag, data in [['train', train_data], ['valid', valid_data],
                      ['test', test_data], ['codebase', codebase_data]]:
        _write_split(language, tag, data)