# X-LoRA Inference: Gemma-7b model for molecular design 


### Helper functions 

In [None]:
import os
import random

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import transformers
from datasets import load_dataset
from datasets import IterableDataset

from transformers import Trainer
from transformers import TrainingArguments
from transformers import DataCollatorWithPadding
from transformers import TrainerCallback
from transformers import AutoConfig
from transformers import BitsAndBytesConfig

from peft import LoraConfig, get_peft_model
from torch.utils.data import Dataset
from transformers import get_linear_schedule_with_warmup
from accelerate import infer_auto_device_map
import math
import numpy as np
import unidecode
import pandas as pd
from matplotlib import pyplot as plt
import peft

from tqdm.notebook import tqdm

# Device onto which generate_response() moves the tokenized prompt.
device='cuda'

def params(model):
    """Print the number of trainable parameters of ``model``.

    Only parameters whose ``requires_grad`` flag is set are counted.

    Args:
        model: Object exposing ``parameters()``; each yielded item must
            provide ``requires_grad`` and ``size()``.
    """
    # A distinct local name avoids shadowing this function inside its own body.
    n_trainable = sum(
        int(np.prod(p.size())) for p in model.parameters() if p.requires_grad
    )

    print("Number of model parameters: ", n_trainable)

def generate_response(model, tokenizer, text_input="Biology offers amazing",
                      num_return_sequences=1,
                      temperature=1.,  # the higher the temperature, the more creative the model becomes
                      max_new_tokens=127,
                      num_beams=1,
                      top_k=50,
                      top_p=0.9, repetition_penalty=1., eos_token_id=107, verbatim=False,
                      exponential_decay_length_penalty_fac=None, add_special_tokens=True, eos_token=None,
                      ):
    """Sample one or more continuations of ``text_input`` and decode them.

    The prompt is tokenized, moved to the module-level ``device`` and passed
    to ``model.generate`` with sampling enabled. Only the newly generated
    tokens (everything after the prompt) are decoded and returned.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        text_input: Prompt text.
        num_return_sequences: Number of sampled sequences to return.
        temperature, top_k, top_p, repetition_penalty, num_beams,
        max_new_tokens: Forwarded unchanged to ``model.generate``.
        eos_token_id: Stop/pad token id used when ``eos_token`` is not given.
        verbatim: If true, print the tokenized prompt and the stop token id.
        exponential_decay_length_penalty_fac: Accepted for compatibility;
            not used by this function.
        add_special_tokens: Forwarded to the tokenizer call.
        eos_token: Explicit stop/pad token id; takes precedence over
            ``eos_token_id``.

    Returns:
        list[str]: Decoded continuations, special tokens removed.
    """
    if eos_token is None:
        # The previous fallback tokenized an empty string and indexed the
        # first id, which fails on an empty id list; it also left the
        # ``eos_token_id`` argument unused. Honour that argument instead.
        eos_token = eos_token_id

    inputs = tokenizer(text_input,
                       add_special_tokens=add_special_tokens,
                       return_tensors='pt').to(device)
    prompt_length = inputs["input_ids"].shape[1]
    if verbatim:
        print("Length of input, tokenized: ", inputs["input_ids"].shape, inputs["input_ids"], "eos_token: ", eos_token)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            temperature=temperature,  # value used to modulate the next token probabilities
            num_beams=num_beams,
            top_k=top_k,
            top_p=top_p,
            num_return_sequences=num_return_sequences,
            eos_token_id=eos_token,
            pad_token_id=eos_token,  # the stop token doubles as padding
            do_sample=True,
            repetition_penalty=repetition_penalty,
        )

    # Drop the prompt tokens so that only the continuation is decoded.
    generated = outputs[:, prompt_length:].detach().cpu().numpy()
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

def generate_answer(model, tokenizer, system='You a helpful assistant. You are familiar with materials science. ',
                    q='What is spider silk in the context of bioinspired materials?',
                    repetition_penalty=1.1,
                    top_p=0.1, top_k=32,
                    temperature=.6, max_new_tokens=512, verbatim=False, eos_token=None, add_special_tokens=True,
                    prepend_response='', messages=None,
                    ):
    """Append a user turn to a chat history and return the model's reply.

    The question (prefixed with ``system`` when that is not None) is appended
    to ``messages`` as a user turn, the history is rendered with the
    tokenizer's chat template, and ``generate_response`` produces the reply.

    Args:
        model, tokenizer: Passed through to ``generate_response``.
        system: Text prepended to the question, or None for no prefix.
        q: The user question.
        repetition_penalty, top_p, top_k, temperature, max_new_tokens,
        verbatim, add_special_tokens: Passed through to ``generate_response``.
        eos_token: Stop token id; defaults to ``tokenizer.eos_token_id``.
        prepend_response: Text appended after the rendered prompt so that the
            reply starts with it.
        messages: Chat history as a list of ``{"role", "content"}`` dicts. A
            list passed by the caller is extended in place with the new user
            turn; when omitted a fresh history is used for this call.

    Returns:
        str: The first generated reply.
    """
    if eos_token is None:
        eos_token = tokenizer.eos_token_id

    # A shared default list would silently carry turns over between calls.
    if messages is None:
        messages = []

    user_content = q if system is None else system + q
    messages.append({"role": "user", "content": user_content})

    txt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    txt = txt + prepend_response

    output_text = generate_response(
        model, tokenizer, text_input=txt, eos_token_id=eos_token,
        num_return_sequences=1, repetition_penalty=repetition_penalty,
        top_p=top_p, top_k=top_k, add_special_tokens=add_special_tokens,
        temperature=temperature, max_new_tokens=max_new_tokens, verbatim=verbatim,
    )
    return output_text[0]

### Load X-LoRA Gemma model 

In [None]:
import torch
from xlora.xlora_utils import load_model 

XLoRa_model_name = 'lamm-mit/x-lora-gemma-7b'

# Load the X-LoRA model and its tokenizer onto the first CUDA device in bfloat16.
model, tokenizer=load_model(model_name = XLoRa_model_name, 
 device='cuda:0',
 use_flash_attention_2=True, 
 dtype=torch.bfloat16,
 )
# Tokenizing an empty string yields no ids, so indexing [0] would fail.
# NOTE(review): the literal is reconstructed as Gemma's end-of-turn control
# token (consistent with the eos_token_id=107 default used by
# generate_response) - confirm against the original notebook.
eos_token_id= tokenizer('<end_of_turn>', add_special_tokens=False, ) ['input_ids'][0]


### Inference using Guidance 

In [None]:
from guidance import models
from guidance import gen, select, system, user, assistant, newline
from IPython.display import display, Markdown

# Wrap the already-loaded model and tokenizer in a Guidance chat model.
gpt = models.TransformersChat(model=model, tokenizer=tokenizer)
# Second name bound to the same object (plain assignment, no copy).
gpt_question_asker = gpt

In [None]:
# User turn: start from the Guidance model and append the question.
with user(): 
 lm =gpt + f"""List the most important biomolecules used in biological materials to make polymers with multifunctional qualities.""" 

# Assistant turn: open the reply with "[" and let gen() (named 'res1')
# produce at most 1024 tokens.
with assistant(): 
 lm+="["+gen('res1', max_tokens=1024)

### Inference using Hugging Face generate functions 

In [None]:
# Single-turn question. generate_answer() prepends the system text to the question.
system_prompt='You are an expert in biological molecular engineering. '
q="""
What are potential molecular engineering approaches to create better materials? Name specific molecules of interest.
"""

# No `messages` argument is passed, so generate_answer() uses its own default history.
res=generate_answer (model, tokenizer,system=system_prompt,
 q=q,
 repetition_penalty=1., top_p=0.9, top_k=256, 
 temperature=.5,max_new_tokens=512, verbatim=False, 
 )

# Render the reply as Markdown under an "X-LoRA" heading.
display (Markdown ("## X-LoRA:\n\n"+res))

In [None]:
# First turn of a multi-turn conversation.
system_prompt='You are an expert in biological molecular engineering. '
q="""
List the most important biomolecules used in biological materials to make polymers with multifunctional qualities.
"""
# Fresh history; generate_answer() appends the user turn to this list in place.
messages=[]
res=generate_answer (model, tokenizer,system=system_prompt,
 q=q, repetition_penalty=1., top_p=0.9, top_k=256, temperature=.5,max_new_tokens=512, verbatim=False,messages=messages )

display (Markdown ("## X-LoRA:\n\n"+res))
# Record the reply so the next cell continues the same conversation.
messages.append ({"role": "assistant", "content": res} )

In [None]:
# Follow-up turn: no system prefix, the existing `messages` history is reused.
system_prompt=None
q="""
How does chitin form a material, specifically in terms of molecular interactions? 
""" 
res=generate_answer (model, tokenizer,system=system_prompt,
 q=q, repetition_penalty=1., top_p=0.9, top_k=256, temperature=.1,max_new_tokens=512, verbatim=False,messages=messages,
 )

display (Markdown ("## X-LoRA:\n\n"+res))
# Append the reply to the shared history.
messages.append ({"role": "assistant", "content": res} )

In [None]:
# Second follow-up turn on the same `messages` history, again without a system prefix.
system_prompt=None
q="""
Thank you. What are potential chemical modifications of N-acetylglucosamine units that would improve mechanical properties?
""" 
res=generate_answer (model, tokenizer,system=system_prompt,
 q=q, repetition_penalty=1., top_p=0.9, top_k=256, temperature=.1,max_new_tokens=512, verbatim=False,messages=messages,
 )

display (Markdown ("## X-LoRA:\n\n"+res))
# Append the reply to the shared history.
messages.append ({"role": "assistant", "content": res} )

### Molecule design examples

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Load the QM9 table; its 'smiles' column provides the list of known molecules
# that is later passed around as SMILES_LIST.
df_smiles=pd.read_csv ('./QM9.csv')
SMILES_LIST=list (df_smiles['smiles'])

# First column is the input, all remaining columns are the targets.
# NOTE(review): presumably column 0 is the SMILES string - confirm against QM9.csv.
X = df_smiles.iloc[:, 0].values.reshape(-1, 1) # Input feature, reshaped for compatibility
y = df_smiles.iloc[:, 1:] # Target features

# Scaling the target features
scaler = MinMaxScaler()
y_scaled = scaler.fit_transform(y)

from sklearn.model_selection import train_test_split

# Fixed random_state keeps the 80/20 split reproducible.
X_train, X_test, y_train, y_test= train_test_split(X, y_scaled, test_size=0.2, random_state=42)

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
# Names of the 12 properties, used as axis labels and to check the length of predicted property vectors.
labels = ["mu", "alpha", "homo", "lumo", "gap", "r2", "zpve", "cv", "u0", "u298", "h298", "g298"]

def return_str(vals=np.array([.1, .5, .6, 2.])):
    """Format a sequence of numbers as comma-separated values with 3 decimals.

    Args:
        vals: Indexable sequence of numbers supporting ``len``.

    Returns:
        str: e.g. ``'0.100,0.500'``; an empty string for an empty sequence.
    """
    formatted = [f'{vals[k]:1.3f}' for k in range(len(vals))]
    return ','.join(formatted)

def extract_start_and_end(string_input, start_token='[', end_token=']'):
    """
    Extracts the substring from 'string_input' that is enclosed between the first occurrence of
    'start_token' and the last occurrence of 'end_token'.

    Args:
        string_input (str): The string from which to extract the substring.
        start_token (str): The starting delimiter. Default is '['.
        end_token (str): The ending delimiter. Default is ']'.

    Returns:
        str: The extracted substring. If 'start_token' or 'end_token' is not found, or the
        end delimiter does not come after the start delimiter, returns an empty string.
    """
    start_index = string_input.find(start_token)
    end_index = string_input.rfind(end_token)

    if start_index == -1 or end_index == -1 or start_index >= end_index:
        return ""

    # Skip the whole start delimiter, not just one character, so that
    # multi-character delimiters are removed completely.
    content_start = start_index + len(start_token)
    if content_start > end_index:
        # The delimiters overlap; nothing is enclosed.
        return ""

    return string_input[content_start:end_index]

def is_SMILES_novel(SMILES, SMILES_LIST=None):
    """Report whether ``SMILES`` is absent from a reference collection.

    Args:
        SMILES: The SMILES string to look up.
        SMILES_LIST: Collection of known SMILES strings, or None.

    Returns:
        True if ``SMILES`` is not contained in ``SMILES_LIST``, False if it
        is, and None when no reference collection was given.
    """
    # Identity check: ``!= None`` is evaluated elementwise on array-likes
    # and then fails when used as a condition.
    if SMILES_LIST is None:
        return None
    return bool(SMILES not in SMILES_LIST)
 
def visualize_SMILES(smiles_code, dir_path='./', root='', sample_count=0):
    """Draw a molecule, show it inline and save it as a PNG.

    Args:
        smiles_code: SMILES string of the molecule to draw.
        dir_path: Directory prefix of the output file.
        root: Free-form tag embedded in the file name.
        sample_count: Counter embedded in the file name.

    Returns:
        str: Path of the saved image.
    """
    rendered = Draw.MolToImage(Chem.MolFromSmiles(smiles_code))

    # Show the picture directly in the notebook before writing it to disk.
    display(rendered)

    out_file = f"{dir_path}/SMILES_{sample_count}_{root}_molecule_image.png"
    rendered.save(out_file)
    return out_file


def design_from_target(
    model,
    tokenizer,
    target,
    temperature=0.1,
    num_beams=1,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.0,
    messages=None
):
    """Ask the model for a molecule matching a vector of target properties.

    A ``GenerateMolecularProperties<...>`` request built from ``target`` is
    appended to ``messages`` as a user turn, the history is rendered with the
    tokenizer's chat template and one response is sampled.

    Args:
        model, tokenizer: Passed through to ``generate_response``.
        target: Sequence of property values, formatted by ``return_str``.
        temperature, num_beams, top_k, top_p, repetition_penalty: Sampling
            settings passed through to ``generate_response``.
        messages: Optional chat history (list of role/content dicts) used to
            prime generation. A list passed by the caller is extended in
            place; when omitted a fresh history is used for this call.

    Returns:
        str: The decoded model output, returned as generated.
    """
    # A shared default list would accumulate requests across calls.
    if messages is None:
        messages = []

    # Format the target line for molecular property generation
    line = f'GenerateMolecularProperties<{return_str(target)}>'
    messages.append({"role": "user", "content": line})

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # num_beams and repetition_penalty are part of the signature and are now
    # forwarded instead of being silently dropped.
    result = generate_response(
        model,
        tokenizer,
        text_input=prompt,
        num_return_sequences=1,
        temperature=temperature,
        num_beams=num_beams,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        max_new_tokens=256
    )[0]

    # NOTE(review): the text is returned without stripping any enclosing
    # brackets - confirm that callers expect the raw model output.
    return result

def properties_from_SMILES(
    model,
    tokenizer,
    target,
    temperature=0.1,
    top_k=128,
    top_p=0.9,
    num_beams=1,
    repetition_penalty=1.0
):
    """Ask the model to predict the property vector of a molecule.

    Sends a single-turn ``CalculateMolecularProperties<...>`` request and
    parses the bracketed, comma-separated list in the reply.

    Args:
        model, tokenizer: Passed through to ``generate_response``.
        target: SMILES string of the molecule.
        temperature, top_k, top_p, num_beams, repetition_penalty: Sampling
            settings passed through to ``generate_response``.

    Returns:
        list[float]: The parsed property values.

    Raises:
        ValueError: If the reply contains no bracketed list or an entry is
            not a number.
    """
    # Format the target line for molecular property calculation
    line = f'CalculateMolecularProperties<{target}>'

    # Each call is an independent single-turn conversation.
    messages = [{"role": "user", "content": line}]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # num_beams and repetition_penalty are now forwarded instead of ignored.
    raw = generate_response(
        model,
        tokenizer,
        text_input=prompt,
        num_return_sequences=1,
        temperature=temperature,
        num_beams=num_beams,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        max_new_tokens=256
    )[0]

    # Keep only the text between the first '[' and the last ']'.
    enclosed = extract_start_and_end(raw, start_token='[', end_token=']')
    if not enclosed.strip():
        raise ValueError(f"No bracketed property list found in model output: {raw!r}")
    return [float(i) for i in enclosed.split(',')]

 
def avg_properties_from_SMILES(model, tokenizer, SMILES='O=C(N)C1OC(CO)C(O)C(O)C1O', SMILES_dir='./',
                               temperature=0.01, top_k=50, top_p=0.95, num_beams=1, repetition_penalty=1.,
                               labels=None, N_prop=6, plot_results=True):
    """Predict a molecule's properties several times and average the results.

    ``properties_from_SMILES`` is called ``N_prop`` times. Attempts that fail
    or return a vector whose length differs from ``labels`` are discarded.

    Args:
        model, tokenizer: Passed through to ``properties_from_SMILES``.
        SMILES: SMILES string of the molecule.
        SMILES_dir: Output directory (created if missing) for the plot.
        temperature, top_k, top_p, num_beams, repetition_penalty: Sampling
            settings passed through to ``properties_from_SMILES``.
        labels: Property names; defaults to the 12 property names below.
        N_prop: Number of prediction attempts.
        plot_results: If true, plot means with error bars and save an SVG.

    Returns:
        tuple: ``(means, std_devs)`` per property. When no attempt succeeded
        both are NaN-filled arrays of length ``len(labels)``.
    """
    os.makedirs(SMILES_dir, exist_ok=True)

    # The default is needed for the length check below even when nothing is
    # plotted; otherwise every attempt is rejected.
    if labels is None:
        labels = ['mu', 'alpha', 'homo', 'lumo', 'gap', 'r2',
                  'zpve', 'cv', 'u0', 'u298', 'h298', 'g298']

    properties = []
    for _ in tqdm(range(N_prop)):
        try:
            _prop = properties_from_SMILES(model, tokenizer, SMILES, temperature=temperature,
                                           top_k=top_k, top_p=top_p,
                                           num_beams=num_beams, repetition_penalty=repetition_penalty)
        except Exception as exc:
            # Best effort: a failed attempt is reported and skipped.
            print(f"Property prediction attempt failed: {exc}")
            continue
        if len(_prop) == len(labels):
            properties.append(np.array(_prop))

    successful = len(properties)
    if successful:
        all_properties = np.array(properties)
        # Mean and standard deviation for each property
        means = np.mean(all_properties, axis=0)
        std_devs = np.std(all_properties, axis=0)
    else:
        # Averaging an empty array only yields a warning and a scalar NaN;
        # return explicit NaN vectors of the expected length instead.
        means = np.full(len(labels), np.nan)
        std_devs = np.full(len(labels), np.nan)

    if plot_results and successful:
        plt.figure(figsize=(6, 4))
        # Square markers; giving both a format string marker and a marker
        # keyword would be redundant.
        plt.errorbar(labels, means, yerr=std_devs, fmt='s', ecolor='red', capsize=5, capthick=2, color='blue')
        plt.xticks(rotation=45)
        plt.xlabel('Property')
        plt.ylabel('Value')
        plt.title('Average Properties with Error Bars')
        plt.tight_layout()
        # Joining the path works whether or not SMILES_dir ends with a separator.
        plt.savefig(os.path.join(SMILES_dir, f"avg_prop_{SMILES}.svg"), format="svg")
        plt.show()

    print(f"Successful attempts: {successful}/{N_prop}")

    return means, std_devs

def is_valid_smiles(smiles):
    """Return True if ``smiles`` can be parsed into a molecule.

    Anything that is not a non-empty string (for example None after a failed
    generation) is reported as invalid rather than being handed to the parser.

    Args:
        smiles: Candidate SMILES string.

    Returns:
        bool: True when ``Chem.MolFromSmiles`` returns a molecule object.
    """
    if not isinstance(smiles, str) or not smiles.strip():
        return False
    # The parser returns None when it cannot build a molecule.
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None
 
def design_molecule(model, tokenizer, target=None, temperature=0.1,
                    num_beams=1, top_k=50, top_p=0.95, repetition_penalty=1.,
                    SMILES_LIST=None, dir_path='./', messages=None, N_attempts_for_forward=1,
                    temperature_pred=0.01):
    """Generate one molecule for a target and compare its predicted properties.

    The model proposes a SMILES string for ``target``. The proposal is checked
    for novelty and validity, drawn, and its properties are predicted and
    plotted next to the target. Each stage is best effort: a failure is
    reported and the remaining stages still run.

    Args:
        model, tokenizer: Passed through to the generation helpers.
        target: Vector of target property values; a random 12-vector is used
            when omitted.
        temperature: Sampling temperature for molecule generation.
        num_beams, top_k, top_p, repetition_penalty: Sampling settings used
            for both generation and property prediction.
        SMILES_LIST: Known SMILES strings for the novelty check, or None.
        dir_path: Output directory (created if missing).
        messages: Optional chat history used to prime generation.
        N_attempts_for_forward: Number of property predictions; values other
            than 1 are averaged with ``avg_properties_from_SMILES``.
        temperature_pred: Sampling temperature for property prediction.

    Returns:
        tuple: ``(SMILES, is_novel)``; ``SMILES`` is None if generation failed.
    """
    os.makedirs(dir_path, exist_ok=True)
    if target is None:
        target = np.random.rand(12)
    if messages is None:
        messages = []

    try:
        SMILES = design_from_target(model, tokenizer, target, temperature=temperature,
                                    num_beams=num_beams, top_k=top_k, top_p=top_p,
                                    repetition_penalty=repetition_penalty, messages=messages)
    except Exception as exc:
        SMILES = None
        print("Generation failed.", exc)

    is_novel = is_SMILES_novel(SMILES, SMILES_LIST)
    # Only hand real output to the validity check.
    is_valid = SMILES is not None and is_valid_smiles(SMILES)
    print("Result: ", SMILES, "is novel: ", is_novel, "is valid: ", is_valid)

    try:
        visualize_SMILES(SMILES, dir_path=dir_path)
    except Exception as exc:
        print("Vis failed.", exc)

    try:
        # Keyword arguments: the helper's positional order differs from the
        # order used in this signature.
        if N_attempts_for_forward == 1:
            predicted = properties_from_SMILES(model, tokenizer, SMILES, temperature=temperature_pred,
                                               top_k=top_k, top_p=top_p, num_beams=num_beams,
                                               repetition_penalty=repetition_penalty)
        else:
            predicted, _ = avg_properties_from_SMILES(model, tokenizer, SMILES, SMILES_dir=dir_path,
                                                      temperature=temperature_pred, top_k=top_k, top_p=top_p,
                                                      num_beams=num_beams, repetition_penalty=repetition_penalty,
                                                      labels=labels, N_prop=N_attempts_for_forward,
                                                      plot_results=False)

        sns.set_style("whitegrid")

        x = np.arange(len(labels))  # Label locations
        width = 0.35  # Width of the bars

        fig, ax = plt.subplots(figsize=(9, 5))
        # Set the background on the figure that is actually drawn; asking for
        # the current figure first would open an extra empty one.
        fig.set_facecolor('white')
        ax.bar(x - width/2, target, width, label='Target')
        ax.bar(x + width/2, predicted, width, label='Predicted properties')

        ax.set_ylabel('Values')
        ax.set_title('Comparison of Target and Predicted Properties')
        ax.set_xticks(x)
        ax.set_xticklabels(labels, rotation=45, ha="right")
        ax.legend()
    except Exception as exc:
        print("Forward analysis failed.", exc)

    return SMILES, is_novel

def design_molecule_loop(model, tokenizer, target=None, temperature_gen=0.3, temperature_pred=0.01, SMILES_LIST=None,
                         top_k=50, top_p=0.95, repetition_penalty=1., num_beams=1, update_primer_with_better_draft=False,
                         threshold=0.01, N_max=100, dir_path='./', lower_bound=0.0, remove_duplicates=True,
                         upper_bound=0.1, sample_count=0, messages=None, N_attempts_for_forward=1, set_opt=None):
    """Repeatedly generate molecules for a target and rank them by MSE.

    Each iteration proposes a SMILES string for ``target``. Proposals that
    are valid (and not known to be in ``SMILES_LIST``) have their properties
    predicted; the mean squared error against the target is recorded. The
    loop stops early once the MSE drops below ``threshold``. Results are
    sorted by MSE, written to ``SMILES_designs.csv`` and visualised.

    Args:
        model, tokenizer: Passed through to the generation helpers.
        target: Vector of target property values; a random 12-vector is used
            when omitted or all zero.
        temperature_gen: Sampling temperature for molecule generation.
        temperature_pred: Sampling temperature for property prediction.
        SMILES_LIST: Known SMILES strings for the novelty check, or None to
            skip novelty filtering.
        top_k, top_p, repetition_penalty, num_beams: Sampling settings.
        update_primer_with_better_draft: If true, re-prime the chat history
            with the best molecule found so far.
        threshold: MSE below which the search stops.
        N_max: Maximum number of iterations.
        dir_path: Output directory (created if missing).
        lower_bound, upper_bound, remove_duplicates: Passed through to
            ``visualize_over_SMILES``.
        sample_count: Counter embedded in the bar-plot file name.
        messages: Optional chat history used to prime generation; it is
            copied for every iteration and never modified.
        N_attempts_for_forward: Number of property predictions per molecule.
        set_opt: Optional index/indices selecting the properties that enter
            the MSE.

    Returns:
        pandas.DataFrame: Columns ``SMILES``, ``Predicted Properties``,
        ``MSE`` and ``is_novel``, sorted by MSE; empty if nothing was accepted.
    """
    # Imported here because the file only imports ``copy`` in a later cell.
    import copy

    columns = ['SMILES', 'Predicted Properties', 'MSE', 'is_novel']

    mse_smallest_current = 9999
    os.makedirs(dir_path, exist_ok=True)
    if target is None or not np.any(target):
        target = np.random.rand(12)
    target = np.asarray(target, dtype=float)

    if messages is None:
        messages = []
    if len(messages) > 0:
        print("Using primed generation:\n", messages)

    records = []  # (SMILES, predicted properties, MSE, is_novel)
    for iteration in range(N_max):
        try:
            print(f">>> Iteration={iteration}")
            # design_from_target extends the history it is given, so every
            # iteration works on its own copy of the primer.
            original_messages = copy.deepcopy(messages)

            SMILES = design_from_target(model, tokenizer, target, temperature_gen, num_beams,
                                        top_k, top_p, repetition_penalty, messages=original_messages)
            is_novel = is_SMILES_novel(SMILES, SMILES_LIST)
            is_valid = is_valid_smiles(SMILES)

            # Novelty is None (unknown) without a reference list; only an
            # explicit "not novel" rejects the candidate.
            if is_novel is False or not is_valid:
                print(f"{SMILES} is not novel or not valid, validity: {is_valid}.")
                continue

            print(f"{SMILES} is novel: {is_novel}", "is valid: ", is_valid)
            # Keyword arguments: the helper's positional order differs from
            # the order used in this signature.
            if N_attempts_for_forward == 1:
                predicted = properties_from_SMILES(model, tokenizer, SMILES, temperature=temperature_pred,
                                                   top_k=top_k, top_p=top_p, num_beams=num_beams,
                                                   repetition_penalty=repetition_penalty)
            else:
                predicted, _ = avg_properties_from_SMILES(model, tokenizer, SMILES, SMILES_dir=dir_path,
                                                          temperature=temperature_pred, top_k=top_k, top_p=top_p,
                                                          num_beams=num_beams,
                                                          repetition_penalty=repetition_penalty,
                                                          labels=labels, N_prop=N_attempts_for_forward,
                                                          plot_results=False)
            predicted = np.asarray(predicted, dtype=float)

            if set_opt is None:
                mse = mean_squared_error(target, predicted)
            else:
                mse = mean_squared_error(np.atleast_1d(target[set_opt]), np.atleast_1d(predicted[set_opt]))

            # NOTE(review): this section was garbled in the source (text
            # between '<' and '>' was lost). Reconstructed from the otherwise
            # unused mse_smallest_current / update_primer_with_better_draft
            # names - confirm against the original notebook.
            if mse < mse_smallest_current:
                mse_smallest_current = mse
                if update_primer_with_better_draft:
                    messages = prime_messages(SMILES, target, N=1)
                    print("Updated primer with better draft:\n", messages)

            records.append((SMILES, predicted, mse, is_novel))
            print(f">>>Iteration={iteration}, MSE={mse} for SMILES={SMILES}, novel={is_novel}")
            if mse < threshold:
                print(f"Threshold met at iteration {iteration+1}")
                break
        except Exception as e:
            print(f"Error during iteration {iteration+1}: {e}")
            continue

    if not records:
        print("No valid designs were generated.")
        return pd.DataFrame(records, columns=columns)

    # Sorting records based on MSE (most accurate first)
    records.sort(key=lambda rec: rec[2])

    # Write the CSV before the best-candidate section so that csv_path
    # exists when it is reported there.
    df = pd.DataFrame(records, columns=columns)
    csv_path = os.path.join(dir_path, 'SMILES_designs.csv')
    df.to_csv(csv_path, index=False)

    # Visualizing the best performing molecule
    best_SMILES, best_predicted, best_mse, is_novel = records[0]

    print("Best SMILES: ", best_SMILES)
    try:
        print(f"{best_SMILES} is novel: {is_novel}")

        sns.set_style("whitegrid")

        visualize_pred_vs_target(target, best_predicted, labels, dir_path=dir_path,
                                 best_SMILES=best_SMILES, sample_count=sample_count)

        print(f"Process completed. Results saved to {csv_path}.")
        visualize_SMILES(best_SMILES, dir_path=dir_path, root=f'{target}_BEST')

        print("Compute molecular structure, UFF eq, Gasteiger, etc.")

        # NOTE(review): compute_gasteiger is not defined in the visible part
        # of this file; if it is missing, the except below reports it.
        compute_gasteiger(best_SMILES, SMILES_dir=dir_path, target=np.array(best_predicted))

        mol = Chem.MolFromSmiles(best_SMILES)
        inchi_str = Chem.MolToInchi(mol)
        print(f"InChI String of {best_SMILES}:", inchi_str)
    except Exception as e:
        print(f"Processing/visualization failed for {best_SMILES}: {e}")

    # Plot MSE per molecule (rows are already ranked by MSE)
    plt.figure(figsize=(10, 8))
    plt.plot(df['SMILES'], df['MSE'], 'o', markersize=5)

    # Adding labels for each point with the SMILES string
    for i, txt in enumerate(df['SMILES']):
        plt.annotate(txt, (i, df['MSE'].iloc[i]), fontsize=8, rotation=45, ha='right')

    # Use this function's own output directory.
    visualize_over_SMILES(df, N_max=N_max, SMILES_dir=dir_path,
                          lower_bound=lower_bound, remove_duplicates=remove_duplicates,
                          upper_bound=upper_bound, target=target)
    return df

from rdkit import Chem
from rdkit.Chem import Draw
import os

def visualize_smiles_and_save(smiles_list, per_row=4, dir_path='./', root=''):
    """
    Visualizes a list of molecules from their SMILES strings with labels, checks for validity,
    and saves the visualization as an SVG file.

    Parameters:
    - smiles_list: List of SMILES strings to visualize.
    - per_row: Number of molecule images per row in the assembly.
    - dir_path: Directory path where the SVG file will be saved (created if missing).
    - root: Tag appended to the SVG file name.

    Returns:
    - The list of SMILES strings that could be parsed, or None if none could.
    """
    os.makedirs(dir_path, exist_ok=True)

    valid_molecules = []
    valid_smiles = []  # Valid SMILES strings, used as legends
    for smile in smiles_list:
        mol = Chem.MolFromSmiles(smile)
        if mol is not None:
            valid_molecules.append(mol)
            valid_smiles.append(smile)

    # Proceed only if there are valid molecules
    if not valid_molecules:
        print("No valid molecules found in the provided SMILES strings.")
        return None

    svg_file_path = os.path.join(dir_path, f'molecules_with_labels_{root}.svg')

    # Use RDKit to draw the molecules grid with labels
    fig = Draw.MolsToGridImage(valid_molecules, molsPerRow=per_row, subImgSize=(200, 200),
                               legends=valid_smiles, useSVG=True)

    # The grid may be a display object carrying the SVG text in ``.data`` or
    # the SVG text itself; handle both.
    svg_text = fig.data if hasattr(fig, 'data') else fig
    with open(svg_file_path, 'w') as svg_file:
        svg_file.write(svg_text)
    display(fig)

    print(f"Visualization saved as SVG at: {svg_file_path}")

    return valid_smiles

def plot_MSE_over_SMILES(df_design, N_max=24,
                         lower_bound=0.0,
                         upper_bound=0.08, SMILES_dir='./', target='', ):
    """Plot the MSE of the first ``N_max`` designs against their SMILES.

    The first ``N_max`` rows are sorted by descending MSE and only rows with
    ``lower_bound < MSE < upper_bound`` are drawn. The plot is saved as
    ``SMILES_over_MSE_<target>.svg`` in ``SMILES_dir`` and shown.

    Args:
        df_design: DataFrame with ``SMILES`` and ``MSE`` columns.
        N_max: Number of leading rows to consider.
        lower_bound, upper_bound: Exclusive MSE limits for plotted rows.
        SMILES_dir: Output directory (created if missing).
        target: Value embedded in the output file name.
    """
    os.makedirs(SMILES_dir, exist_ok=True)

    df_sorted = df_design[:N_max].sort_values('MSE', ascending=False).reset_index(drop=True)
    in_range = (df_sorted['MSE'] > lower_bound) & (df_sorted['MSE'] < upper_bound)
    df_plot = df_sorted[in_range]

    plt.subplots(figsize=(8, 7))
    plt.plot(df_plot['SMILES'], df_plot['MSE'], 'o-', markersize=5)

    plt.xticks(rotation=90)  # Rotate the x-axis labels for better readability
    plt.xlabel('Molecule SMILES')
    plt.ylabel('MSE')
    plt.tight_layout()  # Make room for the rotated x-axis labels
    # Joining the path works whether or not SMILES_dir ends with a separator.
    plt.savefig(os.path.join(SMILES_dir, f'SMILES_over_MSE_{target}.svg'), format='svg')
    plt.show()
 
def visualize_over_SMILES(df_design, N_max=24, per_row=20, SMILES_dir='./',
                          lower_bound=0.0,
                          upper_bound=0.08, target='', remove_duplicates=True):
    """Summarise a table of designs: molecule grid, MSE density and MSE ranking.

    Writes the (optionally de-duplicated) table and the list of drawable
    SMILES to CSV files, draws the first ``N_max`` molecules, plots a
    histogram and KDE of the MSE, and calls ``plot_MSE_over_SMILES``.

    Args:
        df_design: DataFrame with ``SMILES``, ``MSE`` and ``is_novel`` columns.
        N_max: Number of leading rows to draw; also embedded in file names.
        per_row: Molecules per row in the grid image.
        SMILES_dir: Output directory (created if missing).
        lower_bound, upper_bound: MSE range for the density plots.
        target: Value embedded in output file names.
        remove_duplicates: If true, sort by MSE and keep the first (lowest
            MSE) row per SMILES.

    Returns:
        pandas.DataFrame: The table that was visualised, with a fresh index.
    """
    os.makedirs(SMILES_dir, exist_ok=True)

    if remove_duplicates:
        # Keep the entry with the best MSE for each SMILES
        df_design = (
            df_design
            .sort_values(['MSE', 'is_novel', 'SMILES'], ascending=[True, False, True])
            .drop_duplicates(subset='SMILES', keep='first')
        )

    # Work on a re-indexed copy so that the caller's frame is left untouched.
    df_design = df_design.reset_index(drop=True)
    df_design.to_csv(os.path.join(SMILES_dir, f'sorted_noduplicates_{N_max}.csv'), index=False)

    valid_smiles = visualize_smiles_and_save(list(df_design['SMILES'][:N_max]), per_row=per_row,
                                             dir_path=SMILES_dir, root=f'{target}')

    # The helper returns None when no molecule could be drawn.
    smiles_df = pd.DataFrame(valid_smiles or [], columns=["SMILES"])
    smiles_df.to_csv(os.path.join(SMILES_dir, f'valid_SMILES_{N_max}.csv'), index=False)

    fig, ax = plt.subplots(figsize=(8, 5))

    df_plot = df_design[(df_design['MSE'] > lower_bound) & (df_design['MSE'] < upper_bound)]
    # Density of the MSE column only; skip it when there are too few distinct
    # values to estimate one.
    if df_plot['MSE'].nunique() > 1:
        df_plot['MSE'].plot(kind='kde', color='darkblue', label='KDE', ax=ax)

    # Plot histogram with density=True for probability density representation
    plt.hist(df_design['MSE'], density=True, alpha=0.5, color='skyblue', label='Histogram', bins=50,
             range=[lower_bound, upper_bound])
    plt.xlim(lower_bound, upper_bound)
    plt.title('Density and Histogram Plot of MSE')
    plt.xlabel('MSE')
    plt.ylabel('Density')

    # Adding a legend to distinguish between the KDE and Histogram
    plt.legend()

    plt.savefig(os.path.join(SMILES_dir, f'mse_histogram_{target}.svg'), format='svg')
    plt.show()

    plot_MSE_over_SMILES(df_design, N_max=N_max,
                         lower_bound=lower_bound,
                         upper_bound=upper_bound, target=target, SMILES_dir=SMILES_dir)

    return df_design

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas.plotting import parallel_coordinates

def plot_change_in_design(original, labels, target, SMILES_dir='./'):
    """Plot original and target property vectors as parallel coordinates.

    For every property an arrow is drawn from the original value to the
    target value. The figure is saved as
    ``parallel_coordinates_changes_direction.svg`` in ``SMILES_dir`` and shown.

    Args:
        original: Property values of the starting molecule.
        labels: Property names, one per value.
        target: Desired property values.
        SMILES_dir: Output directory (created if missing).
    """
    os.makedirs(SMILES_dir, exist_ok=True)

    # One row per version; the 'Version' column drives the colouring.
    df = pd.DataFrame([original, target], columns=labels)
    df['Version'] = ['Original', 'Target']

    plt.figure(figsize=(7, 4))
    parallel_coordinates(df, 'Version', color=['blue', 'red'])
    plt.title('Original vs Target Values across Properties')
    plt.xticks(rotation=45)
    plt.tight_layout()

    # The arrow always runs from the original to the target value, so it
    # points up or down by itself; no case distinction is needed.
    for i in range(len(labels)):
        plt.annotate('', xy=(i, target[i]), xytext=(i, original[i]),
                     arrowprops=dict(arrowstyle="->", color='black', lw=2))

    # Joining the path works whether or not SMILES_dir ends with a separator.
    plt.savefig(os.path.join(SMILES_dir, "parallel_coordinates_changes_direction.svg"), format="svg")

    plt.show()
 
def visualize_pred_vs_target(target, best_predicted, labels, dir_path='./', best_SMILES='', sample_count=0):
    """Draw a grouped bar chart of target versus predicted properties.

    The chart is saved as
    ``QM9_best_design_<target>_barplot_<sample_count>.svg`` in ``dir_path``
    and shown.

    Args:
        target: Target property values, one per label.
        best_predicted: Predicted property values, one per label.
        labels: Property names used as tick labels.
        dir_path: Output directory (created if missing).
        best_SMILES: SMILES string shown in the title.
        sample_count: Counter embedded in the file name.
    """
    os.makedirs(dir_path, exist_ok=True)
    sns.set_style("whitegrid")

    x = np.arange(len(labels))  # Label locations
    width = 0.35  # Width of the bars

    fig, ax = plt.subplots(figsize=(9, 5))
    # Set the background on the figure that is actually drawn; asking for the
    # current figure beforehand would open an extra empty one.
    fig.set_facecolor('white')
    ax.bar(x - width/2, target, width, label='Target')
    ax.bar(x + width/2, best_predicted, width, label='Predicted properties')

    ax.set_ylabel('Values')
    ax.set_title(f'Comparison of Target and Predicted Properties, {best_SMILES}')
    ax.set_xticks(x)
    ax.set_xticklabels(labels, rotation=45, ha="right")
    ax.legend()
    fig.tight_layout()
    plt.savefig(os.path.join(dir_path, f"QM9_best_design_{target}_barplot_{sample_count}.svg"))
    plt.show()

from rdkit import Chem
from rdkit.Chem import AllChem, Draw
from rdkit.Chem import AllChem, rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
 
def prime_messages(SMILES_chitin_monomer, target, N=1):
    """Build a chat history that primes generation with a known molecule.

    The history holds ``N`` identical exchanges: a user request for the
    ``target`` properties followed by an assistant reply containing the given
    SMILES in square brackets.

    Args:
        SMILES_chitin_monomer: SMILES string used as the assistant's answer.
        target: Property values, formatted by ``return_str``.
        N: Number of exchanges to emit.

    Returns:
        list[dict]: Alternating user/assistant messages (``2 * N`` entries).
    """
    history = []
    for _ in range(N):
        request = f'GenerateMolecularProperties<{return_str( target)}>'
        reply = f'[{SMILES_chitin_monomer}]'
        history.extend([
            {"role": "user", "content": request},
            {"role": "assistant", "content": reply},
        ])
    return history

from rdkit import Chem
from rdkit.Chem import AllChem

def smiles_to_3d(smiles, num_confs=100):
    """Embed 3D conformers for a SMILES string and return the lowest-energy one.

    Hydrogens are added, up to ``num_confs`` conformers are embedded with
    ETKDGv3 (fixed random seed), each is minimised with an MMFF force field
    and the conformer with the lowest energy is kept.

    Args:
        smiles: SMILES string of the molecule.
        num_confs: Number of conformers to request.

    Returns:
        tuple | None: ``(geometry, best_mol)`` where ``geometry`` is a
        newline-separated "symbol x y z" listing and ``best_mol`` holds only
        the selected conformer; None if parsing, embedding or every
        minimisation failed.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print("Failed to create molecule from SMILES")
        return None

    mol = Chem.AddHs(mol)
    embed_params = AllChem.ETKDGv3()
    embed_params.randomSeed = 42
    # Iterate over the ids that were actually embedded; fewer than num_confs
    # conformers may be produced.
    conf_ids = list(AllChem.EmbedMultipleConfs(mol, numConfs=num_confs, params=embed_params))
    if not conf_ids:
        print("Embedding conformations failed.")
        return None

    # The MMFF properties do not depend on the conformer; compute them once.
    mmff_props = AllChem.MMFFGetMoleculeProperties(mol)

    results = []
    for conf_id in conf_ids:
        ff = AllChem.MMFFGetMoleculeForceField(mol, mmff_props, confId=conf_id)
        if ff is None:
            print(f"Failed to setup MMFF for conformer {conf_id}")
            continue
        ff.Minimize()
        results.append((conf_id, ff.CalcEnergy()))

    if not results:
        print("No successful energy minimization.")
        return None

    # (conformer id, energy) pair with the lowest energy; this selection was
    # missing, which left min_energy_conf undefined.
    min_energy_conf = min(results, key=lambda item: item[1])

    best_conf = mol.GetConformer(min_energy_conf[0])
    best_mol = Chem.Mol(mol)
    best_mol.RemoveAllConformers()
    best_mol.AddConformer(best_conf, assignId=True)

    coords = best_conf.GetPositions()
    atom_symbols = [atom.GetSymbol() for atom in best_mol.GetAtoms()]
    geometry = '\n'.join(f'{atom} {coord[0]} {coord[1]} {coord[2]}' for atom, coord in zip(atom_symbols, coords))

    display(best_mol)

    return geometry, best_mol

### Property calculation as possible starting point for design iterations 

In [None]:
# Predict the properties of a starting molecule three times and plot mean and spread.
# NOTE(review): SMILES_dir is not assigned anywhere in the visible file; it must already exist in the session.
SMILES_START='O1C2C3OC2C13'
properties,_=avg_properties_from_SMILES (model, tokenizer, SMILES_START, SMILES_dir=SMILES_dir,
 temperature=0.3, top_k=256,top_p=0.9, num_beams=1, repetition_penalty=1.,
 labels=labels, N_prop=3, plot_results=True)


In [None]:
# Retrieve the scaling parameters
data_min = scaler.data_min_
data_max = scaler.data_max_
scale = scaler.scale_
feature_min = scaler.min_

print("Feature Scaling Parameters:")
print("{:<20} {:<20} {:<20} {:<20}".format("Feature Index", "Min Value", "Max Value", "Scale Factor"))
for i in range(len(data_min)):
 print("{:<20} {:<20} {:<20} {:<20}".format(i, data_min[i], data_max[i], scale[i]))

print("\nPer-feature Shifts (Min):")
for i, min_val in enumerate(feature_min):
 print("Feature {}: {:.6f}".format(i, min_val))

### Molecular design: Iterative solution 

In [None]:
import copy 
# Take the scaled property vector of one test-set molecule as the starting point.
properties=y_test[4]

#Create new set of properties based on existing molecule (from test set)
properties_new=copy.deepcopy (properties)
# Shift the first two properties by +0.2 (in scaled units) to define the design target.
properties_new[0]=properties[0]+0.2
properties_new[1]=properties[1]+0.2
# Show original versus target as a parallel-coordinates plot.
plot_change_in_design (properties, labels, properties_new,SMILES_dir)

In [None]:
# Run the iterative design search for the shifted target: at most 64 iterations,
# six property predictions per candidate, stopping once the MSE falls below 0.001.
df_design=design_molecule_loop (model, tokenizer, np.array(properties_new), SMILES_LIST=SMILES_LIST, dir_path=SMILES_dir,
 temperature_pred=0.1, temperature_gen=0.3, top_k=32,top_p=0.1, repetition_penalty=1.,
 threshold=0.001, N_max=64, 
 N_attempts_for_forward=6,
 )

In [None]:
# Keep the table returned by visualize_over_SMILES: df_design_2 is read below
# but was never assigned.
df_design_2 = visualize_over_SMILES (df_design,N_max=30,SMILES_dir=SMILES_dir,per_row=5,
 lower_bound = 0.0, remove_duplicates=True,
 upper_bound = 0.02, target=np.array(properties_new))

target=np.array(properties_new)
# NOTE(review): this picks the row at position 5, not the lowest-MSE row at
# position 0, and needs at least six rows - confirm that this is intended.
best_SMILES, best_predicted, best_mse, is_novel = df_design_2.iloc[5]

print ("Best SILES: ", best_SMILES)
print (f"{best_SMILES} is novel: {is_novel}")

sns.set_style("whitegrid")

visualize_pred_vs_target (target, best_predicted, labels, dir_path=SMILES_dir, best_SMILES=best_SMILES,sample_count=0)
 
visualize_SMILES(best_SMILES, dir_path=SMILES_dir, root=f'{target}_BEST')

In [None]:
# Compare one selected design with the target and draw its structure.
# NOTE(review): df_design_2 is not assigned in this cell; it must already exist in the session.
target=np.array(properties_new)
# Unpack the row at position 5 into its four columns.
best_SMILES, best_predicted, best_mse, is_novel = df_design_2.iloc[5]

print ("Best SILES: ", best_SMILES)
print (f"{best_SMILES} is novel: {is_novel}")

sns.set_style("whitegrid")

# Bar chart of target versus predicted properties, saved under SMILES_dir.
visualize_pred_vs_target (target, best_predicted, labels, dir_path=SMILES_dir, best_SMILES=best_SMILES,sample_count=0)
 
# Draw the molecule and save the image under SMILES_dir.
visualize_SMILES(best_SMILES, dir_path=SMILES_dir, root=f'{target}_BEST')