fadliaulawi
committed on
Commit
•
862259b
1
Parent(s):
fb4710e
Generate result with LLM validation
Browse files- app.py +5 -4
- process.py +35 -18
- prompt.py +7 -5
- resources/experiment.ipynb +53 -0
app.py
CHANGED
@@ -33,7 +33,7 @@ uploaded_files = st.file_uploader("Upload Paper(s) here :", type="pdf", accept_m
|
|
33 |
|
34 |
chunk_option = st.selectbox(
|
35 |
'Tokens amounts per process :',
|
36 |
-
(32000, 16000, 8000
|
37 |
)
|
38 |
chunk_overlap = 0
|
39 |
|
@@ -112,14 +112,15 @@ if uploaded_files:
|
|
112 |
dataframe = dataframe[['Genes', 'SNPs', 'Diseases', 'Title', 'Authors', 'Publisher Name', 'Publication Year', 'Population', 'Sample Size', 'Study Methodology', 'Study Level', 'Conclusion']]
|
113 |
dataframe.drop_duplicates(['Genes', 'SNPs'], inplace=True)
|
114 |
dataframe.reset_index(drop=True, inplace=True)
|
115 |
-
|
116 |
|
117 |
end_time = datetime.now()
|
118 |
st.write("Success in ", round((end_time.timestamp() - start_time.timestamp()) / 60, 2), "minutes")
|
119 |
|
120 |
-
st.dataframe(
|
121 |
with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
|
122 |
-
|
|
|
123 |
dataframe.to_excel(writer, sheet_name='Original')
|
124 |
writer.close()
|
125 |
|
|
|
33 |
|
34 |
chunk_option = st.selectbox(
|
35 |
'Tokens amounts per process :',
|
36 |
+
(32000, 16000, 8000), key='table_hv'
|
37 |
)
|
38 |
chunk_overlap = 0
|
39 |
|
|
|
112 |
dataframe = dataframe[['Genes', 'SNPs', 'Diseases', 'Title', 'Authors', 'Publisher Name', 'Publication Year', 'Population', 'Sample Size', 'Study Methodology', 'Study Level', 'Conclusion']]
|
113 |
dataframe.drop_duplicates(['Genes', 'SNPs'], inplace=True)
|
114 |
dataframe.reset_index(drop=True, inplace=True)
|
115 |
+
cleaned_df, cleaned_llm_df = validate(dataframe)
|
116 |
|
117 |
end_time = datetime.now()
|
118 |
st.write("Success in ", round((end_time.timestamp() - start_time.timestamp()) / 60, 2), "minutes")
|
119 |
|
120 |
+
st.dataframe(cleaned_df)
|
121 |
with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
|
122 |
+
cleaned_llm_df.to_excel(writer, sheet_name='Result with LLM')
|
123 |
+
cleaned_df.to_excel(writer, sheet_name='Result')
|
124 |
dataframe.to_excel(writer, sheet_name='Original')
|
125 |
writer.close()
|
126 |
|
process.py
CHANGED
@@ -8,7 +8,7 @@ from langchain.chains.llm import LLMChain
|
|
8 |
from langchain.prompts import PromptTemplate
|
9 |
from langchain_openai import ChatOpenAI
|
10 |
from pdf2image import convert_from_path
|
11 |
-
from prompt import prompt_entity_gsd_chunk, prompt_entity_gsd_combine, prompt_entity_summ_chunk, prompt_entity_summ_combine, prompt_entities_chunk, prompt_entities_combine, prompt_entity_one_chunk, prompt_table
|
12 |
from table_detector import detection_transform, device, model, ocr, outputs_to_objects
|
13 |
|
14 |
import io
|
@@ -180,6 +180,7 @@ def get_table(path):
|
|
180 |
|
181 |
def validate(df):
|
182 |
|
|
|
183 |
df = df.fillna('')
|
184 |
df['Genes'] = df['Genes'].str.upper()
|
185 |
df['SNPs'] = df['SNPs'].str.lower()
|
@@ -191,32 +192,48 @@ def validate(df):
|
|
191 |
for s in sym:
|
192 |
if s in gene:
|
193 |
genes = gene.split(s)
|
194 |
-
df.loc[
|
195 |
-
df
|
196 |
-
df.loc[
|
197 |
|
198 |
# Check if there is SNPs without 'rs'
|
199 |
for i in df.index:
|
200 |
safe = True
|
201 |
snp = df.loc[i, 'SNPs']
|
202 |
-
if
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
|
|
|
|
212 |
if safe:
|
213 |
df.loc[i, 'SNPs'] = snp
|
214 |
|
215 |
df.reset_index(drop=True, inplace=True)
|
216 |
|
217 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
|
219 |
-
|
220 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
221 |
|
222 |
-
return df
|
|
|
8 |
from langchain.prompts import PromptTemplate
|
9 |
from langchain_openai import ChatOpenAI
|
10 |
from pdf2image import convert_from_path
|
11 |
+
from prompt import prompt_entity_gsd_chunk, prompt_entity_gsd_combine, prompt_entity_summ_chunk, prompt_entity_summ_combine, prompt_entities_chunk, prompt_entities_combine, prompt_entity_one_chunk, prompt_table, prompt_validation
|
12 |
from table_detector import detection_transform, device, model, ocr, outputs_to_objects
|
13 |
|
14 |
import io
|
|
|
180 |
|
181 |
def validate(df):
|
182 |
|
183 |
+
df = df[df['Genes'].notna()].reset_index(drop=True)
|
184 |
df = df.fillna('')
|
185 |
df['Genes'] = df['Genes'].str.upper()
|
186 |
df['SNPs'] = df['SNPs'].str.lower()
|
|
|
192 |
for s in sym:
|
193 |
if s in gene:
|
194 |
genes = gene.split(s)
|
195 |
+
df.loc[i + 0.5] = df.loc[i]
|
196 |
+
df = df.sort_index().reset_index(drop=True)
|
197 |
+
df.loc[i, 'Genes'], df.loc[i + 1, 'Genes'] = genes[0], genes[1]
|
198 |
|
199 |
# Check if there is SNPs without 'rs'
|
200 |
for i in df.index:
|
201 |
safe = True
|
202 |
snp = df.loc[i, 'SNPs']
|
203 |
+
if re.fullmatch('rs(\d)+|', snp):
|
204 |
+
pass
|
205 |
+
elif re.fullmatch('ts(\d)+', snp):
|
206 |
+
snp = 't' + snp[1:]
|
207 |
+
elif re.fullmatch('s(\d)+', snp):
|
208 |
+
snp = 'r' + snp
|
209 |
+
elif re.fullmatch('(\d)+', snp):
|
210 |
+
snp = 'rs' + snp
|
211 |
+
else:
|
212 |
+
safe = False
|
213 |
+
df = df.drop(i)
|
214 |
+
|
215 |
if safe:
|
216 |
df.loc[i, 'SNPs'] = snp
|
217 |
|
218 |
df.reset_index(drop=True, inplace=True)
|
219 |
|
220 |
+
# Validate genes and diseases with LLM
|
221 |
+
json_table = df[['Genes', 'SNPs', 'Diseases']].to_json(orient='records')
|
222 |
+
str_json_table = json.dumps(json.loads(json_table), indent=2)
|
223 |
+
|
224 |
+
result = llm_p.invoke(model='mistral-7b-instruct', input=prompt_validation.format(str_json_table)).content
|
225 |
+
print('val')
|
226 |
+
print(result)
|
227 |
|
228 |
+
result = result[result.find('['):result.rfind(']')+1]
|
229 |
+
try:
|
230 |
+
result = eval(result)
|
231 |
+
except SyntaxError:
|
232 |
+
result = []
|
233 |
+
|
234 |
+
df_val = pd.DataFrame(result)
|
235 |
+
df_val = df_val.merge(df.head(1).drop(['Genes', 'SNPs', 'Diseases'], axis=1), 'cross')
|
236 |
+
|
237 |
+
# TODO: How to validate genes and SNPs?
|
238 |
|
239 |
+
return df, df_val
|
prompt.py
CHANGED
@@ -265,8 +265,8 @@ If there is no specific extracted entities provided from the table, just leave t
|
|
265 |
|
266 |
prompt_validation = """
|
267 |
# CONTEXT #
|
268 |
-
In my capacity as a genomics specialist, I have table data containing gene names with
|
269 |
-
The problem is because the data is
|
270 |
|
271 |
This is the data:
|
272 |
{}
|
@@ -274,11 +274,13 @@ This is the data:
|
|
274 |
# OBJECTIVE #
|
275 |
Given the provided table data, the following tasks need to be completed:
|
276 |
|
277 |
-
1. Check whether the gene name is
|
278 |
-
|
|
|
|
|
279 |
|
280 |
# RESPONSE #
|
281 |
-
The output
|
282 |
[
|
283 |
{{
|
284 |
"Genes": "A",
|
|
|
265 |
|
266 |
prompt_validation = """
|
267 |
# CONTEXT #
|
268 |
+
In my capacity as a genomics specialist, I have table data containing gene names with their corresponding SNPs and diseases. The data is provided in a list of JSON format, with each JSON object representing a single row in a tabular structure.
|
269 |
+
The problem is because the data is extracted using OCR, some gene names and SNPs may have a typo.
|
270 |
|
271 |
This is the data:
|
272 |
{}
|
|
|
274 |
# OBJECTIVE #
|
275 |
Given the provided table data, the following tasks need to be completed:
|
276 |
|
277 |
+
1. Check whether the gene name is the correct gene name. If the gene name is suspected of a typo, fix it into the correct form. If the gene name seems like a mistake entirely or invalid, remove the data row. Common errors include:
|
278 |
+
- Combined Names: Two gene names erroneously merged into one. Separate these using "and": "A and B".
|
279 |
+
- OCR Errors: Similar characters misread by the system. Correct these to the intended form.
|
280 |
+
2. If diseases are not empty, check whether the gene name corresponds with the gene names. Fix it with the correct diseases if the original disease is wrong.
|
281 |
|
282 |
# RESPONSE #
|
283 |
+
The output must be only a string containing a list of JSON objects, adhering to the identical structure present in the original input data. Each object representing a validated entry with the following structure:
|
284 |
[
|
285 |
{{
|
286 |
"Genes": "A",
|
resources/experiment.ipynb
CHANGED
@@ -2316,6 +2316,59 @@
|
|
2316 |
"result = llm.invoke(model='llama-3-70b-instruct', input=prompt)\n",
|
2317 |
"print(result.content)"
|
2318 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2319 |
}
|
2320 |
],
|
2321 |
"metadata": {
|
|
|
2316 |
"result = llm.invoke(model='llama-3-70b-instruct', input=prompt)\n",
|
2317 |
"print(result.content)"
|
2318 |
]
|
2319 |
+
},
|
2320 |
+
{
|
2321 |
+
"cell_type": "code",
|
2322 |
+
"execution_count": 2,
|
2323 |
+
"metadata": {},
|
2324 |
+
"outputs": [
|
2325 |
+
{
|
2326 |
+
"name": "stderr",
|
2327 |
+
"output_type": "stream",
|
2328 |
+
"text": [
|
2329 |
+
"c:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\load.py:1429: FutureWarning: The repository for bigbio/euadr contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/bigbio/euadr\n",
|
2330 |
+
"You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
|
2331 |
+
"Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
|
2332 |
+
" warnings.warn(\n"
|
2333 |
+
]
|
2334 |
+
},
|
2335 |
+
{
|
2336 |
+
"ename": "ConnectionError",
|
2337 |
+
"evalue": "Couldn't reach https://biosemantics.erasmusmc.nl/downloads/euadr.tgz (ConnectTimeout(MaxRetryError(\"HTTPSConnectionPool(host='biosemantics.erasmusmc.nl', port=443): Max retries exceeded with url: /downloads/euadr.tgz (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001CD0D00D060>, 'Connection to biosemantics.erasmusmc.nl timed out. (connect timeout=100)'))\")))",
|
2338 |
+
"output_type": "error",
|
2339 |
+
"traceback": [
|
2340 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
2341 |
+
"\u001b[1;31mConnectionError\u001b[0m Traceback (most recent call last)",
|
2342 |
+
"\u001b[1;32m<ipython-input-2-8057498175ab>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mdatasets\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mload_dataset\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mdataset\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mload_dataset\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"bigbio/euadr\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
|
2343 |
+
"\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\load.py\u001b[0m in \u001b[0;36mload_dataset\u001b[1;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)\u001b[0m\n\u001b[0;32m 2547\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2548\u001b[0m \u001b[1;31m# Download and prepare data\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2549\u001b[1;33m builder_instance.download_and_prepare(\n\u001b[0m\u001b[0;32m 2550\u001b[0m \u001b[0mdownload_config\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdownload_config\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2551\u001b[0m \u001b[0mdownload_mode\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdownload_mode\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
2344 |
+
"\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\builder.py\u001b[0m in \u001b[0;36mdownload_and_prepare\u001b[1;34m(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)\u001b[0m\n\u001b[0;32m 1003\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mnum_proc\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1004\u001b[0m \u001b[0mprepare_split_kwargs\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"num_proc\"\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnum_proc\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1005\u001b[1;33m self._download_and_prepare(\n\u001b[0m\u001b[0;32m 1006\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdl_manager\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1007\u001b[0m \u001b[0mverification_mode\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mverification_mode\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
2345 |
+
"\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\builder.py\u001b[0m in \u001b[0;36m_download_and_prepare\u001b[1;34m(self, dl_manager, verification_mode, **prepare_splits_kwargs)\u001b[0m\n\u001b[0;32m 1765\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1766\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_download_and_prepare\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mverification_mode\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mprepare_splits_kwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1767\u001b[1;33m super()._download_and_prepare(\n\u001b[0m\u001b[0;32m 1768\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1769\u001b[0m \u001b[0mverification_mode\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
2346 |
+
"\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\builder.py\u001b[0m in \u001b[0;36m_download_and_prepare\u001b[1;34m(self, dl_manager, verification_mode, **prepare_split_kwargs)\u001b[0m\n\u001b[0;32m 1076\u001b[0m \u001b[0msplit_dict\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mSplitDict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdataset_name\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdataset_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1077\u001b[0m \u001b[0msplit_generators_kwargs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_make_split_generators_kwargs\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mprepare_split_kwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1078\u001b[1;33m \u001b[0msplit_generators\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_split_generators\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdl_manager\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0msplit_generators_kwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1079\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1080\u001b[0m \u001b[1;31m# Checksums verification\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
2347 |
+
"\u001b[1;32m~\\.cache\\huggingface\\modules\\datasets_modules\\datasets\\bigbio--euadr\\38388d88a335f2d91807b0f813bdfd809fec0e9dcbc32e2d9bfea7275d70f75c\\euadr.py\u001b[0m in \u001b[0;36m_split_generators\u001b[1;34m(self, dl_manager)\u001b[0m\n\u001b[0;32m 105\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_split_generators\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 106\u001b[0m \u001b[0murls\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_URL\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 107\u001b[1;33m \u001b[0mdatapath\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdownload_and_extract\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murls\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 108\u001b[0m return [\n\u001b[0;32m 109\u001b[0m datasets.SplitGenerator(\n",
|
2348 |
+
"\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\download\\download_manager.py\u001b[0m in \u001b[0;36mdownload_and_extract\u001b[1;34m(self, url_or_urls)\u001b[0m\n\u001b[0;32m 560\u001b[0m \u001b[0mextracted_path\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ms\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;31m`\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mextracted\u001b[0m \u001b[0mpaths\u001b[0m \u001b[0mof\u001b[0m \u001b[0mgiven\u001b[0m \u001b[0mURL\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ms\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 561\u001b[0m \"\"\"\n\u001b[1;32m--> 562\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mextract\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdownload\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl_or_urls\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 563\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 564\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mget_recorded_sizes_checksums\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
2349 |
+
"\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\download\\download_manager.py\u001b[0m in \u001b[0;36mdownload\u001b[1;34m(self, url_or_urls)\u001b[0m\n\u001b[0;32m 424\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 425\u001b[0m \u001b[0mstart_time\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnow\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 426\u001b[1;33m downloaded_path_or_paths = map_nested(\n\u001b[0m\u001b[0;32m 427\u001b[0m \u001b[0mdownload_func\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 428\u001b[0m \u001b[0murl_or_urls\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
2350 |
+
"\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\utils\\py_utils.py\u001b[0m in \u001b[0;36mmap_nested\u001b[1;34m(function, data_struct, dict_only, map_list, map_tuple, map_numpy, num_proc, parallel_min_length, types, disable_tqdm, desc)\u001b[0m\n\u001b[0;32m 457\u001b[0m \u001b[1;31m# Singleton\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 458\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtypes\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 459\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mfunction\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 460\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 461\u001b[0m \u001b[0miterable\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32melse\u001b[0m \u001b[0mdata_struct\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
2351 |
+
"\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\download\\download_manager.py\u001b[0m in \u001b[0;36m_download\u001b[1;34m(self, url_or_filename, download_config)\u001b[0m\n\u001b[0;32m 449\u001b[0m \u001b[1;31m# append the relative path to the base_path\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 450\u001b[0m \u001b[0murl_or_filename\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0murl_or_path_join\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_base_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0murl_or_filename\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 451\u001b[1;33m \u001b[0mout\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcached_path\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl_or_filename\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdownload_config\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdownload_config\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 452\u001b[0m \u001b[0mout\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtracked_str\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 453\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mset_origin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl_or_filename\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
2352 |
+
"\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\utils\\file_utils.py\u001b[0m in \u001b[0;36mcached_path\u001b[1;34m(url_or_filename, download_config, **download_kwargs)\u001b[0m\n\u001b[0;32m 186\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mis_remote_url\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl_or_filename\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 187\u001b[0m \u001b[1;31m# URL, so get it from the cache (downloading if necessary)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 188\u001b[1;33m output_path = get_from_cache(\n\u001b[0m\u001b[0;32m 189\u001b[0m \u001b[0murl_or_filename\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 190\u001b[0m \u001b[0mcache_dir\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcache_dir\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
2353 |
+
"\u001b[1;32mc:\\Users\\Fadli\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\datasets\\utils\\file_utils.py\u001b[0m in \u001b[0;36mget_from_cache\u001b[1;34m(url, cache_dir, force_download, proxies, etag_timeout, resume_download, user_agent, local_files_only, use_etag, max_retries, token, use_auth_token, ignore_url_params, storage_options, download_desc)\u001b[0m\n\u001b[0;32m 571\u001b[0m \u001b[0m_raise_if_offline_mode_is_enabled\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf\"Tried to reach {url}\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 572\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mhead_error\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 573\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf\"Couldn't reach {url} ({repr(head_error)})\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 574\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mresponse\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 575\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34mf\"Couldn't reach {url} (error {response.status_code})\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
2354 |
+
"\u001b[1;31mConnectionError\u001b[0m: Couldn't reach https://biosemantics.erasmusmc.nl/downloads/euadr.tgz (ConnectTimeout(MaxRetryError(\"HTTPSConnectionPool(host='biosemantics.erasmusmc.nl', port=443): Max retries exceeded with url: /downloads/euadr.tgz (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000001CD0D00D060>, 'Connection to biosemantics.erasmusmc.nl timed out. (connect timeout=100)'))\")))"
|
2355 |
+
]
|
2356 |
+
}
|
2357 |
+
],
|
2358 |
+
"source": [
|
2359 |
+
"from datasets import load_dataset\n",
|
2360 |
+
"\n",
|
2361 |
+
"dataset = load_dataset(\"bigbio/euadr\")"
|
2362 |
+
]
|
2363 |
+
},
|
2364 |
+
{
|
2365 |
+
"cell_type": "code",
|
2366 |
+
"execution_count": null,
|
2367 |
+
"metadata": {},
|
2368 |
+
"outputs": [],
|
2369 |
+
"source": [
|
2370 |
+
"dataset"
|
2371 |
+
]
|
2372 |
}
|
2373 |
],
|
2374 |
"metadata": {
|