Spaces:

LeMaterial
/

materials_explorer

Running

Ramlaoui committed 16 days ago

Commit

e690d34

•

1 Parent(s): b0471af

Vectorized preprocessing

Files changed (1) hide show

create_index.py CHANGED Viewed

@@ -3,7 +3,6 @@ import re
 import numpy as np
 import periodictable
-import tqdm
 from datasets import load_dataset
 HF_TOKEN = os.environ.get("HF_TOKEN")
@@ -41,15 +40,26 @@ map_periodic_table = {v.symbol: k for k, v in enumerate(periodictable.elements)}
 dataset_index = np.zeros((len(dataset), 118))
-for i, row in tqdm.tqdm(enumerate(dataset), total=len(dataset)):
-    for el in row["chemical_formula_descriptive"].split(" "):
-        matches = re.findall(r"([a-zA-Z]+)([0-9]*)", el)
-        el = matches[0][0]
-        numb = int(matches[0][1]) if matches[0][1] else 1
-        dataset_index[i][map_periodic_table[el]] = numb
-    dataset_index[i] = dataset_index[i] / np.sum(dataset_index[i])
 dataset_index = (
     dataset_index / np.linalg.norm(dataset_index, axis=1)[:, None]
 )  # Normalize vectors

 import numpy as np
 import periodictable
 from datasets import load_dataset
 HF_TOKEN = os.environ.get("HF_TOKEN")
 dataset_index = np.zeros((len(dataset), 118))
+train_df = dataset.to_pandas()
+pattern = re.compile(r"(?P<element>[A-Z][a-z]?)(?P<count>\d*)")
+extracted = train_df["chemical_formula_descriptive"].str.extractall(pattern)
+extracted["count"] = extracted["count"].replace("", "1").astype(int)
+wide_df = extracted.reset_index().pivot_table(  # Move index to columns for pivoting
+    index="level_0",  # original row index
+    columns="element",
+    values="count",
+    aggfunc="sum",
+    fill_value=0,
+)
+all_elements = [el.symbol for el in periodictable.elements]  # full element list
+wide_df = wide_df.reindex(columns=all_elements, fill_value=0)
+dataset_index = wide_df.values
+dataset_index = dataset_index / np.sum(dataset_index, axis=1)[:, None]
 dataset_index = (
     dataset_index / np.linalg.norm(dataset_index, axis=1)[:, None]
 )  # Normalize vectors