# Source snapshot: commit a5fb347, 6,306 bytes.
# (Web-viewer artifacts — error banners and the line-number gutter — removed.)
import os, subprocess, pydriller,json, pandas as pd
import sys
from dotenv import dotenv_values
from Database import Database
class RefactorAnalysis:
    """Mine method-level refactorings from a git repository.

    Wraps the RefactoringMiner CLI: runs it on a repository, parses its JSON
    report, and (via pydriller) extracts the before/after source of each
    refactored method into a pandas DataFrame.
    """

    def __init__(self, input_path="", output_path=""):
        """Configure the repository to analyze and the report location.

        Args:
            input_path: Path to the git repository; defaults to the bundled
                toy example under ``data/refactoring-toy-example``.
            output_path: Where RefactoringMiner's JSON report lives (or will
                be written); defaults to ``output_ref/output.json`` next to
                this file.
        """
        base_dir = os.path.dirname(os.path.abspath(__file__))
        self.input_path = input_path or os.path.join(base_dir, "data", "refactoring-toy-example")
        self.output_path = output_path or os.path.join(base_dir, "output_ref", "output.json")

    def generate_refactor_details(self):
        """Run the RefactoringMiner CLI, writing JSON to ``self.output_path``.

        Returns:
            None on success, 1 on a non-subprocess failure (e.g. the
            executable directory is missing). Exits the process with status 1
            when RefactoringMiner itself returns a non-zero code.
        """
        ref_miner_bin = os.path.abspath("executable/RefactoringMiner/bin")
        command = ["sh", "RefactoringMiner", "-a", self.input_path, "-json", self.output_path]
        prev_cwd = os.getcwd()
        try:
            # The launcher script must be executed from its own directory.
            os.chdir(ref_miner_bin)
            shell_result = subprocess.run(command, capture_output=True, text=True)
            shell_result.check_returncode()
        except subprocess.CalledProcessError as error:
            print(error)
            sys.exit(1)  # non-zero exit: the miner reported a failure
        except Exception as e:
            print(e)
            return 1
        finally:
            # Don't leak the working-directory change to callers.
            os.chdir(prev_cwd)

    def parse_json_output(self):
        """Parse the RefactoringMiner report, keeping method-level refactorings.

        Returns:
            Dict keyed by commit sha1, each value holding:
                ``paths``          – file path of every left-side location,
                ``ref_start_end``  – matching (startLine, endLine) tuples,
                ``ref_type``       – type of the last *method* refactoring seen
                                     in that commit.
            Commits with no method-level refactoring are omitted.
        """
        with open(self.output_path) as f:
            json_output = json.load(f)
        dict_output = {}
        for commit_obj in json_output["commits"]:
            changes = []
            se_lines = []
            ref_type = None
            for ref in commit_obj["refactorings"]:
                # Only method-level refactorings (e.g. "Extract Method").
                if "Method" not in ref["type"]:
                    continue
                ref_type = ref["type"]
                for loc in ref["leftSideLocations"]:
                    changes.append(loc["filePath"])
                    se_lines.append((loc["startLine"], loc["endLine"]))
            if ref_type is None:
                # All refactorings were non-method ones (or there were none):
                # storing an entry here would label empty data with a bogus type.
                continue
            dict_output[commit_obj["sha1"]] = {
                "paths": changes,
                "ref_start_end": se_lines,
                "ref_type": ref_type,
            }
        return dict_output

    def create_project_dataframe(self):
        """Build a DataFrame with one row per refactored method.

        Columns: commit, refactoring_type, filename, meth_rf_neg (method body
        after the change) and method_refactored (method body before it).
        """
        df = pd.DataFrame(columns=['commit', 'refactoring_type', 'filename', 'meth_rf_neg', 'method_refactored'])
        parse_output_dict = self.parse_json_output()
        commits_to_analyze = list(parse_output_dict.keys())
        for commit in pydriller.Repository(self.input_path, only_commits=commits_to_analyze).traverse_commits():
            ref_list = parse_output_dict.get(commit.hash)
            # RefactoringMiner reports repo-relative paths; pydriller exposes
            # only the file name, so compare basenames.
            ref_path_name = [str(p).split("/")[-1] for p in ref_list["paths"]]
            for cf in commit.modified_files:
                try:
                    index_ref = ref_path_name.index(cf.filename)
                except ValueError:
                    continue  # file not involved in a method refactoring
                start, end = ref_list["ref_start_end"][index_ref]
                for cm in cf.changed_methods:
                    # The changed method must fully contain the refactored span.
                    if cm.start_line <= start and cm.end_line >= end:
                        method_source_code = self.__split_and_extract_methods(cf.source_code_before, cm.start_line, cm.end_line)
                        method_source_code_neg = self.__split_and_extract_methods(cf.source_code, cm.start_line, cm.end_line)
                        df.loc[len(df)] = {
                            "commit": commit.hash,
                            "refactoring_type": ref_list["ref_type"],
                            "filename": cf.filename,
                            "meth_rf_neg": method_source_code_neg,
                            "method_refactored": method_source_code,
                        }
        return df

    def __split_and_extract_methods(self, source_code, start_line, end_line):
        """Return lines [start_line, end_line] (1-based, inclusive) of source_code."""
        source_code_lines = str(source_code).splitlines()
        return "\n".join(source_code_lines[start_line - 1:end_line])
def main():
    """Mine refactorings from every downloaded repository and store them.

    Downloads the repositories first (via ``repo_download.py``) if
    ``data/repos/`` does not exist, then builds a per-repo DataFrame of
    method refactorings and inserts the rows into the configured MongoDB
    collection in batches of ``batch_size`` repositories.
    """
    if not os.path.exists("data/repos/"):
        try:
            print("Starting repo download")
            repo_script = subprocess.run(["python", "repo_download.py"], capture_output=True, text=True)
            repo_script.check_returncode()
        except subprocess.CalledProcessError as err:
            print(err)
            sys.exit(1)
        print("Repo Download Completed")
    lst_repos = next(os.walk("data/repos/"))[1]  # immediate sub-directories, one per repo
    print(len(lst_repos))
    cwd = os.path.dirname(os.path.abspath(__file__))
    columns = ['commit', 'refactoring_type', 'filename', 'meth_rf_neg', 'method_refactored']
    final_df = pd.DataFrame(columns=columns)
    database = Database(dotenv_values(".env")['COLLECTION_NAME'])
    batch_size = 5
    processed = 0  # repos accumulated since the last DB insert
    for idx, repo in enumerate(lst_repos):
        # Undo any working-directory change left over from a previous iteration.
        os.chdir(cwd)
        try:
            ref_obj = RefactorAnalysis(os.path.abspath(os.path.join("data/repos", repo)),
                                       os.path.abspath(os.path.join("output_ref", repo + ".json")))
            df = ref_obj.create_project_dataframe()
        except Exception as e:
            # Best effort: one broken repository must not halt the pipeline.
            print(e)
            continue
        final_df = pd.concat([final_df, df], ignore_index=True)
        processed += 1
        if processed == batch_size:
            print("Inserting into DB", idx)
            database.insert_docs(final_df.to_dict(orient="records"))
            final_df = pd.DataFrame(columns=columns)
            processed = 0
    # Flush the final partial batch. (The original checked idx==len-1 inside
    # the loop body, which was skipped when the last repo raised, losing rows.)
    if len(final_df):
        print("Inserting into DB", len(lst_repos) - 1)
        database.insert_docs(final_df.to_dict(orient="records"))


if __name__ == "__main__":
    main()