# Install dependencies (run once in a notebook; TensorFlow already bundles Keras).
!pip install rdkit-pypi
!pip install tensorflow
!pip install numpy
!pip install pandas
!pip install scikit-learn
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
from tensorflow.keras import layers, models, backend as K
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from IPython.display import display  # renders the molecule images in a notebook
# Example dataset of SMILES strings
smiles_list = ['CCO', 'CCN', 'CCC', 'CCCl', 'COC', 'C1CCCCC1']
# Define vocabulary from SMILES
charset = set("".join(smiles_list))
charset.add(" ") # Add padding character
char_to_int = {char: i for i, char in enumerate(sorted(charset))}
int_to_char = {i: char for char, i in char_to_int.items()}
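# With the toy dataset above, sorted(charset) is [' ', '1', 'C', 'N', 'O', 'l'], so
# char_to_int maps {' ': 0, '1': 1, 'C': 2, 'N': 3, 'O': 4, 'l': 5}.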
# Encode SMILES to integer sequences
max_length = max(len(smiles) for smiles in smiles_list)
encoded_smiles = [[char_to_int[char] for char in smiles] for smiles in smiles_list]
encoded_smiles = pad_sequences(encoded_smiles, maxlen=max_length, padding='post', value=char_to_int[" "])
# One-hot encode
one_hot_encoded_smiles = np.array([to_categorical(smile, num_classes=len(charset)) for smile in encoded_smiles])
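# Sanity check: the tensor has shape (num_molecules, max_length, vocabulary_size).
assert one_hot_encoded_smiles.shape == (len(smiles_list), max_length, len(charset))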
# Split data into training and testing sets
X_train, X_test = train_test_split(one_hot_encoded_smiles, test_size=0.2, random_state=42)
# Define VAE parameters
input_shape = X_train.shape[1:] # (max_length, len(charset))
latent_dim = 2 # Reduced for simplicity in the example
# Encoder
inputs = layers.Input(shape=input_shape)
x = layers.LSTM(128, return_sequences=False)(inputs)
z_mean = layers.Dense(latent_dim)(x)
z_log_var = layers.Dense(latent_dim)(x)
def sampling(args):
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(K.shape(z_mean)[0], latent_dim))
    return z_mean + K.exp(0.5 * z_log_var) * epsilon
z = layers.Lambda(sampling)([z_mean, z_log_var])
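# The Lambda layer implements the reparameterization trick:
# z = z_mean + exp(0.5 * z_log_var) * epsilon with epsilon ~ N(0, I), which keeps the
# sampling step differentiable with respect to z_mean and z_log_var.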
# Decoder
decoder_input = layers.Input(shape=(latent_dim,))
x = layers.RepeatVector(max_length)(decoder_input)
x = layers.LSTM(128, return_sequences=True)(x)
decoded = layers.TimeDistributed(layers.Dense(len(charset), activation='softmax'))(x)
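# The decoder tiles the latent vector across max_length time steps (RepeatVector),
# runs an LSTM over them, and predicts a softmax distribution over the character
# vocabulary at every position (TimeDistributed Dense).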
# Models
encoder = models.Model(inputs, z_mean)           # deterministic encoder: mean of q(z|x)
decoder = models.Model(decoder_input, decoded)
outputs = decoder(z)                             # decode the *sampled* latent vector z
vae = models.Model(inputs, outputs)
# Loss: the compiled categorical cross-entropy reconstructs the one-hot character
# sequences, while the KL divergence between q(z|x) and the unit-Gaussian prior is
# attached via add_loss() because it depends on the model-internal z_mean / z_log_var
# tensors rather than on (y_true, y_pred). Rescale one of the terms if you want a
# different reconstruction/KL balance.
kl_loss = -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
vae.add_loss(K.mean(kl_loss))
vae.compile(optimizer='rmsprop', loss='categorical_crossentropy')
vae.summary()
# Train the VAE model
epochs = 50 # Adjust as needed
batch_size = 32
vae.fit(X_train, X_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, X_test))
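# Optional: inspect where the held-out molecules land in the 2-D latent space.
latent_test = encoder.predict(X_test)
print("Latent coordinates of test molecules:\n", latent_test)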
# Sample new points in latent space
num_molecules = 10
new_latent_points = np.random.normal(size=(num_molecules, latent_dim))
# Decode to SMILES
generated_smiles = decoder.predict(new_latent_points)
# Convert one-hot encoded sequences back to SMILES
def decode_smiles(encoded):
    smiles = ""
    for vec in encoded:
        index = np.argmax(vec)
        smiles += int_to_char[index]
    return smiles.strip()
# Convert generated sequences to SMILES
decoded_smiles = [decode_smiles(smile) for smile in generated_smiles]
print("Generated SMILES:")
for smile in decoded_smiles:
    print(smile)
# Visualize generated molecules
for smile in decoded_smiles:
    if not smile:
        continue  # all-padding output, nothing to draw
    mol = Chem.MolFromSmiles(smile)
    if mol:
        display(Draw.MolToImage(mol))
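# Optional summary: with such a tiny training set, many sampled latent points will
# decode to padding or invalid strings, so report how many parse as molecules.
valid = [s for s in decoded_smiles if s and Chem.MolFromSmiles(s) is not None]
print(f"{len(valid)}/{len(decoded_smiles)} generated strings are valid SMILES")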