Vectorized preprocessing
create_index.py  +18 -8
create_index.py
CHANGED
@@ -3,7 +3,6 @@ import re
 
 import numpy as np
 import periodictable
-import tqdm
 from datasets import load_dataset
 
 HF_TOKEN = os.environ.get("HF_TOKEN")
@@ -41,15 +40,26 @@ map_periodic_table = {v.symbol: k for k, v in enumerate(periodictable.elements)}
 
 
 dataset_index = np.zeros((len(dataset), 118))
+train_df = dataset.to_pandas()
 
-
-
-
-        el = matches[0][0]
-        numb = int(matches[0][1]) if matches[0][1] else 1
-        dataset_index[i][map_periodic_table[el]] = numb
-    dataset_index[i] = dataset_index[i] / np.sum(dataset_index[i])
+pattern = re.compile(r"(?P<element>[A-Z][a-z]?)(?P<count>\d*)")
+extracted = train_df["chemical_formula_descriptive"].str.extractall(pattern)
+extracted["count"] = extracted["count"].replace("", "1").astype(int)
 
+wide_df = extracted.reset_index().pivot_table(  # Move index to columns for pivoting
+    index="level_0",  # original row index
+    columns="element",
+    values="count",
+    aggfunc="sum",
+    fill_value=0,
+)
+
+all_elements = [el.symbol for el in periodictable.elements]  # full element list
+wide_df = wide_df.reindex(columns=all_elements, fill_value=0)
+
+dataset_index = wide_df.values
+
+dataset_index = dataset_index / np.sum(dataset_index, axis=1)[:, None]
 dataset_index = (
     dataset_index / np.linalg.norm(dataset_index, axis=1)[:, None]
 ) # Normalize vectors
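For reference, below is a minimal standalone sketch of the vectorized path this commit introduces, run on a few made-up formulas. The toy train_df and its sample values are assumptions for illustration only; in create_index.py the frame comes from dataset.to_pandas() and the column is chemical_formula_descriptive, as in the diff. extractall pulls element/count pairs out of each formula, pivot_table turns them into one row per entry with one column per element, and the two divisions produce composition fractions and then unit-length vectors.

import re

import numpy as np
import pandas as pd
import periodictable

# Toy stand-in for dataset.to_pandas(); the real data comes from load_dataset in create_index.py.
train_df = pd.DataFrame({"chemical_formula_descriptive": ["H2 O", "Fe2 O3", "Na Cl"]})

# One row per (formula, matched element), with named capture groups as columns.
pattern = re.compile(r"(?P<element>[A-Z][a-z]?)(?P<count>\d*)")
extracted = train_df["chemical_formula_descriptive"].str.extractall(pattern)
extracted["count"] = extracted["count"].replace("", "1").astype(int)  # missing count means 1

# Pivot to one row per original formula, one column per element symbol.
wide_df = extracted.reset_index().pivot_table(
    index="level_0",   # original row index of train_df
    columns="element",
    values="count",
    aggfunc="sum",
    fill_value=0,
)

# Reindex to the full periodic table so every row has the same columns in a fixed order.
all_elements = [el.symbol for el in periodictable.elements]
wide_df = wide_df.reindex(columns=all_elements, fill_value=0)

dataset_index = wide_df.values
dataset_index = dataset_index / np.sum(dataset_index, axis=1)[:, None]          # composition fractions
dataset_index = dataset_index / np.linalg.norm(dataset_index, axis=1)[:, None]  # normalize to unit length

print(dataset_index.shape)  # (3, len(all_elements))

Compared with the removed per-row loop (which needed tqdm for a progress bar), this processes the whole column at once with pandas string and pivot operations, which is why both the loop and the tqdm import disappear in the diff.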