Henry65 committed on
Commit
f514f40
1 Parent(s): 1911ed5

Upload 8 files

Files changed (8)
  1. README.md +3 -0
  2. RepoPipeline.py +214 -0
  3. config.json +47 -0
  4. merges.txt +0 -0
  5. special_tokens_map.json +51 -0
  6. tokenizer.json +0 -0
  7. tokenizer_config.json +64 -0
  8. vocab.json +0 -0
README.md ADDED
@@ -0,0 +1,3 @@
+ ---
+ license: mit
+ ---
RepoPipeline.py ADDED
@@ -0,0 +1,214 @@
+ from typing import Dict, Any, List
+
+ import ast
+ import tarfile
+ from ast import AsyncFunctionDef, ClassDef, FunctionDef, Module
+ import torch
+ import requests
+ from transformers import Pipeline
+ from tqdm.auto import tqdm
+
+
+ def extract_code_and_docs(text: str):
+     code_set = set()
+     docs_set = set()
+     root = ast.parse(text)
+     for node in ast.walk(root):
+         if not isinstance(node, (AsyncFunctionDef, FunctionDef, ClassDef, Module)):
+             continue
+         docs = ast.get_docstring(node)
+         node_without_docs = node
+         if docs is not None:
+             docs_set.add(docs)
+             # Remove docstrings from the node
+             node_without_docs.body = node_without_docs.body[1:]
+         if isinstance(node, (AsyncFunctionDef, FunctionDef)):
+             code_set.add(ast.unparse(node_without_docs))
+
+     return code_set, docs_set
+
+
+ def get_metadata(repo_name, headers=None):
+     api_url = f"https://api.github.com/repos/{repo_name}"
+     tqdm.write(f"[+] Getting metadata for {repo_name}")
+     try:
+         response = requests.get(api_url, headers=headers)
+         response.raise_for_status()
+         return response.json()
+     except requests.exceptions.HTTPError as e:
+         tqdm.write(f"[-] Failed to retrieve metadata from {repo_name}: {e}")
+         return {}
+
+
+ def extract_information(repos, headers=None):
+     extracted_infos = []
+     for repo_name in tqdm(repos, disable=len(repos) <= 1):
+         # Get metadata
+         metadata = get_metadata(repo_name, headers=headers)
+         repo_info = {
+             "name": repo_name,
+             "codes": set(),
+             "docs": set(),
+             "requirements": set(),
+             "readmes": set(),
+             "topics": [],
+             "license": "",
+             "stars": metadata.get("stargazers_count"),
+         }
+         if metadata.get("topics"):
+             repo_info["topics"] = metadata["topics"]
+         if metadata.get("license"):
+             repo_info["license"] = metadata["license"]["spdx_id"]
+
+         # Download repo tarball bytes
+         download_url = f"https://api.github.com/repos/{repo_name}/tarball"
+         tqdm.write(f"[+] Downloading {repo_name}")
+         try:
+             response = requests.get(download_url, headers=headers, stream=True)
+             response.raise_for_status()
+         except requests.exceptions.HTTPError as e:
+             tqdm.write(f"[-] Failed to download {repo_name}: {e}")
+             continue
+
+         # Extract python files and parse them
+         tqdm.write(f"[+] Extracting {repo_name} info")
+         with tarfile.open(fileobj=response.raw, mode="r|gz") as tar:
+             for member in tar:
+                 if not (member.name.endswith(".py") and member.isfile()):
+                     continue
+                 try:
+                     file_content = tar.extractfile(member).read().decode("utf-8")
+                     code_set, docs_set = extract_code_and_docs(file_content)
+
+                     repo_info["codes"].update(code_set)
+                     repo_info["docs"].update(docs_set)
+                 except UnicodeDecodeError as e:
+                     tqdm.write(
+                         f"[-] UnicodeDecodeError in {member.name}, skipping: \n{e}"
+                     )
+                 except SyntaxError as e:
+                     tqdm.write(f"[-] SyntaxError in {member.name}, skipping: \n{e}")
+
+         extracted_infos.append(repo_info)
+
+     return extracted_infos
+
+
+ class RepoPipeline(Pipeline):
+
+     def __init__(self, github_token=None, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+         # GitHub token used for API authentication
+         self.github_token = github_token
+         if self.github_token:
+             print("[+] GitHub token set!")
+         else:
+             print(
+                 "[*] Please set a GitHub token to avoid unexpected errors. \n"
+                 "For more info, see: "
+                 "https://docs.github.com/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token"
+             )
+
+     def _sanitize_parameters(self, **pipeline_parameters):
+         preprocess_parameters = {}
+         if "github_token" in pipeline_parameters:
+             preprocess_parameters["github_token"] = pipeline_parameters["github_token"]
+
+         forward_parameters = {}
+         if "max_length" in pipeline_parameters:
+             forward_parameters["max_length"] = pipeline_parameters["max_length"]
+
+         postprocess_parameters = {}
+         return preprocess_parameters, forward_parameters, postprocess_parameters
+
+     def preprocess(self, input_: Any, **preprocess_parameters: Dict) -> List:
+         # Normalize the input to a list of repository names
+         if isinstance(input_, str):
+             input_ = [input_]
+
+         # Build request headers; a per-call token overrides the instance token
+         github_token = preprocess_parameters.get("github_token")
+         headers = {"Accept": "application/vnd.github+json"}
+         token = github_token or self.github_token
+         if token:
+             headers["Authorization"] = f"Bearer {token}"
+
+         # Get information for each repository in input_
+         extracted_infos = extract_information(input_, headers=headers)
+
+         return extracted_infos
+
+     def encode(self, text, max_length):
+         assert max_length < 1024
+
+         tokenizer = self.tokenizer
+         tokens = (
+             [tokenizer.cls_token, "<encoder-only>", tokenizer.sep_token]
+             + tokenizer.tokenize(text)[: max_length - 4]
+             + [tokenizer.sep_token]
+         )
+         tokens_id = tokenizer.convert_tokens_to_ids(tokens)
+         source_ids = torch.tensor([tokens_id]).to(self.device)
+
+         token_embeddings = self.model(source_ids)[0]
+         sentence_embeddings = token_embeddings.mean(dim=1)
+
+         return sentence_embeddings
+
+     def generate_embeddings(self, text_sets, max_length):
+         assert max_length < 1024
+         if text_sets is None or len(text_sets) == 0:  # empty input -> zero vector
+             return torch.zeros((1, 768), device=self.device)
+         return torch.concat([self.encode(text, max_length) for text in text_sets], dim=0)
+
+     def _forward(self, extracted_infos: List, **forward_parameters: Dict) -> List:
+         max_length = forward_parameters.get("max_length") or 512
+
+         model_outputs = []
+         num_repos = len(extracted_infos)
+         with tqdm(total=num_repos) as progress_bar:
+             # For each repository
+             for repo_info in extracted_infos:
+                 repo_name = repo_info["name"]
+                 info = {
+                     "name": repo_name,
+                     "topics": repo_info["topics"],
+                     "license": repo_info["license"],
+                     "stars": repo_info["stars"],
+                 }
+                 progress_bar.set_description(f"Processing {repo_name}")
+
+                 # Code embeddings
+                 tqdm.write(f"[*] Generating code embeddings for {repo_name}")
+                 code_embeddings = self.generate_embeddings(repo_info["codes"], max_length)
+                 info["code_embeddings"] = code_embeddings.tolist()
+                 info["mean_code_embedding"] = torch.mean(code_embeddings, dim=0).tolist()
+
+                 # Doc embeddings
+                 tqdm.write(f"[*] Generating doc embeddings for {repo_name}")
+                 doc_embeddings = self.generate_embeddings(repo_info["docs"], max_length)
+                 info["doc_embeddings"] = doc_embeddings.tolist()
+                 info["mean_doc_embedding"] = torch.mean(doc_embeddings, dim=0).tolist()
+
+                 # Requirement embeddings
+                 tqdm.write(f"[*] Generating requirement embeddings for {repo_name}")
+                 requirement_embeddings = self.generate_embeddings(repo_info["requirements"], max_length)
+                 info["requirement_embeddings"] = requirement_embeddings.tolist()
+                 info["mean_requirement_embedding"] = torch.mean(requirement_embeddings, dim=0).tolist()
+
+                 # Readme embeddings
+                 tqdm.write(f"[*] Generating readme embeddings for {repo_name}")
+                 readme_embeddings = self.generate_embeddings(repo_info["readmes"], max_length)
+                 info["readme_embeddings"] = readme_embeddings.tolist()
+                 info["mean_readme_embedding"] = torch.mean(readme_embeddings, dim=0).tolist()
+
+                 progress_bar.update(1)
+                 model_outputs.append(info)
+
+         return model_outputs
+
+     def postprocess(self, model_outputs: List, **postprocess_parameters: Dict) -> List:
+         return model_outputs
+
+
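For reference, a minimal usage sketch of the custom pipeline above (not part of the uploaded files). It assumes the files in this commit are served under the repo id "Lazyhope/unixcoder-nine-advtest" taken from config.json's _name_or_path; substitute the actual repo id if it differs. Loading RepoPipeline.py from the Hub requires trust_remote_code=True, and github_token / max_length are optional call arguments handled by _sanitize_parameters.

    # Usage sketch; the repo id and inputs are placeholders, not part of this commit
    from transformers import pipeline

    repo_pipeline = pipeline(
        "feature-extraction",                      # task registered in config.json's custom_pipelines
        model="Lazyhope/unixcoder-nine-advtest",   # assumption: the repo hosting these files
        trust_remote_code=True,                    # needed so RepoPipeline.py is downloaded and used
    )

    # Accepts a single "owner/name" string or a list of them
    outputs = repo_pipeline(
        "huggingface/transformers",
        github_token=None,   # optionally a GitHub personal access token to raise rate limits
        max_length=512,
    )
    info = outputs[0]
    print(info["name"], len(info["mean_code_embedding"]))  # repo name, 768

Each returned dict carries the repository's topics, license, and stars plus the per-set and mean embeddings produced in _forward.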
config.json ADDED
@@ -0,0 +1,47 @@
+ {
+   "_name_or_path": "Lazyhope/unixcoder-nine-advtest",
+   "architectures": [
+     "RobertaModel"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "custom_pipelines": {
+     "feature-extraction": {
+       "default": {
+         "model": {
+           "pt": [
+             "Lazyhope/unixcoder-nine-advtest",
+             "main"
+           ]
+         }
+       },
+       "impl": "RepoPipeline.RepoPipeline",
+       "pt": [
+         "AutoModel"
+       ],
+       "tf": [],
+       "type": "text"
+     }
+   },
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 1026,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "output_past": true,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.30.2",
+   "type_vocab_size": 10,
+   "use_cache": true,
+   "vocab_size": 51416
+ }
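The custom_pipelines block above maps the "feature-extraction" task to RepoPipeline.RepoPipeline; the rest describes a standard RobertaModel with hidden_size 768, which is why the mean-pooled embeddings are 768-dimensional. Below is a minimal sketch of what RepoPipeline.encode() does when the model and tokenizer are loaded directly; the repo id is again an assumption taken from _name_or_path.

    # Direct encoding sketch mirroring RepoPipeline.encode(); assumed repo id
    import torch
    from transformers import AutoModel, AutoTokenizer

    repo_id = "Lazyhope/unixcoder-nine-advtest"
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    model = AutoModel.from_pretrained(repo_id)  # RobertaModel per this config

    text = "def add(a, b):\n    return a + b"
    tokens = (
        [tokenizer.cls_token, "<encoder-only>", tokenizer.sep_token]  # UniXcoder encoder-only prefix
        + tokenizer.tokenize(text)[: 512 - 4]
        + [tokenizer.sep_token]
    )
    source_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
    with torch.no_grad():
        embedding = model(source_ids)[0].mean(dim=1)  # shape (1, 768)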
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
+ {
+   "add_prefix_space": false,
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "errors": "replace",
+   "mask_token": {
+     "__type": "AddedToken",
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": {
+     "__type": "AddedToken",
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "tokenizer_class": "RobertaTokenizer",
+   "trim_offsets": true,
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff