File size: 6,170 Bytes
2652d5f 7f79d79 2652d5f 7f79d79 94d4492 7f79d79 722e35c 94d4492 d0947a0 ddb3d2c 2652d5f d0947a0 63740cd 11412ae d0947a0 2652d5f 92b5e51 2652d5f d0947a0 2652d5f d0947a0 2652d5f 8bf4041 2652d5f d0947a0 2652d5f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
import streamlit as st
import os
import subprocess
import sys
import shutil
from Crypto.PublicKey import RSA
from datasets import load_dataset
import pandas as pd
import oci
from oci import object_storage
from oci.object_storage.models import CreateBucketDetails
from oci.object_storage.models import CreatePreauthenticatedRequestDetails
import pickle
from PIL import Image
# --- Page header -----------------------------------------------------------
# Streamlit renders elements in the order these statements execute, so the
# sequence below is also the top-to-bottom layout of the page.
st.set_page_config(page_title="Oracle")
# The banner is opened from a path relative to the process working directory.
image = Image.open('oracle_huggingface.png')
st.image(image)
st.title("Oracle x HF")
st.header("Upload HF Dataset to OCI Object Storage!")
st.caption("The first of many features and integrations between HF and OCI")

# --- Defaults --------------------------------------------------------------
# config_location, gen_api_key and private_key_location are piped to the
# `oci setup config` command when the "Generate API Key" button is pressed.
# user_ocid, tenancy_ocid and region are placeholders only: they are
# overwritten by the form inputs a few lines further down. hf_dataset is
# likewise replaced once a dataset has been loaded.
config_location = ".oci/config"
user_ocid = ""
tenancy_ocid = ""
region = ""
gen_api_key = "n"
private_key_location = ".oci/private_key.pem"
hf_dataset = ""

# --- OCI settings form -----------------------------------------------------
# Widgets inside a form only deliver their values when the form's submit
# button is pressed; `oracle_submitted` is True on that rerun.
oracle_form = st.form("configuration")
oracle_form.write("OCI Settings")
user_ocid = oracle_form.text_input("Enter the User OCID", help="To find your user OCID, first login to your OCI account. From the home screen, click on the top right face icon marked as profile and click on your email. From here, find and copy your OCID into this box")
tenancy_ocid = oracle_form.text_input("Enter the tenancy ocid", help="To find your tenancy OCID, first login to your OCI account. From the home screen, click on the top right face icon marked as profile and click on your tenancy. From here, find and copy your OCID into this box")
region = oracle_form.text_input("Enter the region", "us-ashburn-1", help="To find your region identifier, first login to your OCI account. From the home screen, click on the top right region name. Scroll to the bottom of the regions list to the option labeled as manage region. Within this, the list of region identifiers is available. Copy your corresponding region identifier into here.")
# NOTE(review): this checkbox value is collected here; confirm that the upload
# step actually consults it before relying on it.
existing_checkbox = oracle_form.checkbox("Check this if you want to put the dataset into an existing bucket rather than create a new one")
oracle_submitted = oracle_form.form_submit_button("Generate API Key")
oracle_form.write("After the API Key is generated, go to the same screen as where you got your user_ocid from. Scroll down to the bottom left and look for the API Key section under the Resources tab . Click on this, and then click the Add API Key button. Click on the Paste Public Key option. Copy and Paste the Generated API Key")

# --- Dataset settings form -------------------------------------------------
# `dataset_submitted` is True on the rerun triggered by this form's button.
dataset_form = st.form("dataset")
dataset_form.write("Dataset Settings")
dataset_name = dataset_form.text_area("Enter the name of the huggingface Dataset:", value = "biosses", help ="Datasets can be found here: https://huggingface.co/datasets The default, BIOSSES is a benchmark dataset for biomedical sentence similarity estimation.")
# The default value here is a single space, not an empty string.
dataset_name_2 = dataset_form.text_area("Enter the name of the config for the dataset if it has one", value = " ", help="Some datasets have config options associated with them, enter them here")
split_name = dataset_form.text_area("Enter the name of the split of the dataset that you want to use", value = "train")
pd_checkbox = dataset_form.checkbox("Check this if you want this to be uploaded as a pandas dataframe instead of a HF Dataset Object")
dataset_submitted = dataset_form.form_submit_button("Upload Dataset to OCI Object Storage")
def load_and_process_data(path, name, streaming, split_name):
    """Load a Hugging Face dataset and return one of its splits.

    String arguments are stripped of surrounding whitespace before use,
    because they come from free-text UI fields. A config name that is empty
    or whitespace-only is treated as "no config" and passed as ``None`` so
    that ``load_dataset`` falls back to the dataset's default configuration.

    Args:
        path: Dataset identifier, forwarded to ``load_dataset`` as ``path``.
        name: Optional config name, forwarded as ``name`` (``None`` if blank).
        streaming: Forwarded to ``load_dataset`` as ``streaming``.
        split_name: Key of the split to pick from the loaded dataset.

    Returns:
        The object stored under ``split_name`` in the result of
        ``load_dataset``.
    """

    def _clean(value):
        # Only strings are normalised; any other value passes through as-is.
        return value.strip() if isinstance(value, str) else value

    # The config text area defaults to a single space, so a blank value must
    # not be sent to ``load_dataset`` as a literal config name.
    config_name = _clean(name) or None

    dataset = load_dataset(
        path=_clean(path),
        name=config_name,
        streaming=streaming,
        keep_in_memory=True,
    )
    return dataset[_clean(split_name)]
if oracle_submitted:
    # Answers piped to the interactive `oci setup config` command, one per
    # line and in the order the prompts are expected. The "Y" presumably
    # confirms reuse/overwrite of the pre-created config file, and "USER" is
    # the profile name that the upload step later reads back.
    input_str = "\n".join(
        [
            config_location,
            "Y",
            "USER",
            user_ocid,
            tenancy_ocid,
            region,
            gen_api_key,
            private_key_location,
        ]
    )

    # Start from an empty .oci directory so keys and config from a previous
    # run never mix with the new ones.
    shutil.rmtree(".oci", ignore_errors=True)
    os.makedirs(".oci", exist_ok=True)
    # Create an empty config file before the CLI runs.
    with open(".oci/config", "a"):
        pass

    # Generate a fresh 2048-bit RSA key pair and export both halves.
    key = RSA.generate(2048)
    private_key = key.export_key()
    public_key = key.publickey().export_key()

    # Create the private key owner-readable only (0o600) from the start
    # instead of relying on the process umask.
    private_fd = os.open(
        ".oci/private_key.pem",
        os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
        0o600,
    )
    with os.fdopen(private_fd, "wb") as key_file:
        key_file.write(private_key)
    with open(".oci/public_key.pem", "wb") as key_file:
        key_file.write(public_key)

    # Let the OCI CLI write the config from the answers above.
    p = subprocess.run(["oci", "setup", "config"], text=True, input=input_str)
    if p.returncode != 0:
        # Keep going so the user can still see the key and whatever config
        # was written, but do not pretend the setup succeeded.
        oracle_form.warning(
            "`oci setup config` exited with code {}; the generated config "
            "may be incomplete.".format(p.returncode)
        )

    # The public key must be shown with .text(): per the original author's
    # note, st.write is not the right way to output an RSA key.
    oracle_form.text(public_key.decode("utf-8"))

    with open(".oci/config", encoding="utf-8") as config_file:
        config_text = config_file.read()
    # Write into the expander itself. Calling oracle_form.text() inside a
    # `with oracle_form.expander(...)` block targets the form, not the
    # expander, which leaves the expander empty.
    config_expander = oracle_form.expander("Open to see the generated OCI config file")
    config_expander.text(config_text)
if dataset_submitted:
    # Pull the requested split from the Hugging Face Hub (non-streaming).
    hf_dataset = load_and_process_data(dataset_name, dataset_name_2, False, split_name)
    if pd_checkbox:
        hf_dataset = pd.DataFrame.from_dict(hf_dataset)
    st.write(hf_dataset)
    st.write("Dataset Pulled Succesfully!")

    # The config is produced by the "Generate API Key" step; fail with a
    # readable message instead of a traceback when that step was skipped.
    if not os.path.exists(".oci/config"):
        st.error("No OCI config found at .oci/config. Generate an API key with the OCI Settings form first.")
        st.stop()

    oci_config = oci.config.from_file(".oci/config", profile_name="USER")
    # Use a distinct name for the client so the imported `object_storage`
    # module is not rebound.
    object_storage_client = object_storage.ObjectStorageClient(oci_config)
    st.write("Object Storage Connected Succesfully")
    namespace = object_storage_client.get_namespace().data
    # Buckets are created directly in the tenancy (root) compartment.
    compartment_id = oci_config["tenancy"]
    st.write(namespace)

    # The bucket and the object share one name derived from the dataset id;
    # "/" is replaced because dataset ids may be of the form "owner/name".
    bucket_name = dataset_name.strip().replace("/", "-")

    if existing_checkbox:
        # The user asked to reuse an existing bucket, so skip creation.
        st.write("Using existing bucket, Writing Dataset to Bucket")
    else:
        try:
            # NOTE(review): 'ObjectRead' makes the uploaded object publicly
            # readable, and the 'Archive' tier presumably requires a restore
            # before objects can be downloaded -- confirm both are intended.
            bucket = object_storage_client.create_bucket(
                namespace,
                CreateBucketDetails(
                    name=bucket_name,
                    compartment_id=compartment_id,
                    storage_tier='Archive',
                    public_access_type='ObjectRead',
                ),
            )
        except oci.exceptions.ServiceError as err:
            if err.status == 409:
                # HTTP 409: a bucket with this name already exists.
                st.write("Bucket Exists, Writing Dataset to Bucket")
            else:
                # Surface the real failure instead of mislabelling it as
                # "bucket exists"; the upload is still attempted because the
                # bucket may exist even when creation is not permitted.
                st.warning(
                    "Bucket creation failed ({} {}): {}. Attempting upload anyway.".format(
                        err.status, err.code, err.message
                    )
                )
        else:
            st.write("Bucket Written:")
            st.write(bucket)

    st.write("Uploading new object if it doesn't already exist {!r}".format(hf_dataset))
    # NOTE(review): the payload is a pickle; anyone downloading it must only
    # unpickle it if they trust this bucket's contents.
    hf_bytes = pickle.dumps(hf_dataset)
    object_storage_client.put_object(
        namespace,
        bucket_name,
        bucket_name,
        hf_bytes,
    )
    st.write("Object Uploaded Successfully!")
#if test_connection_button:
|