Untitled
unknown
plain_text
25 days ago
3.6 kB
2
Indexable
Never
# Character-level variational autoencoder (VAE) over SMILES strings:
# encodes one-hot SMILES sequences to a 2-D latent space with an LSTM,
# samples new latent points, decodes them back to SMILES, and draws any
# chemically valid results with RDKit.
#
# NOTE: these were notebook shell commands ("!pip install ..."); they are not
# valid Python syntax in a .py file, so they are kept here as a comment:
#   pip install rdkit-pypi tensorflow keras numpy pandas scikit-learn

import numpy as np
import pandas as pd  # kept from the original import block
from rdkit import Chem
from rdkit.Chem import Draw
from tensorflow.keras import layers, models, optimizers, backend as K
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# BUG FIX: `display` was used below without being imported — it is only
# defined implicitly inside IPython/Jupyter. Fall back to print elsewhere.
try:
    from IPython.display import display
except ImportError:
    display = print

# Example dataset of SMILES strings
smiles_list = ['CCO', 'CCN', 'CCC', 'CCCl', 'COC', 'C1CCCCC1']

# Build character vocabulary from the SMILES corpus
charset = set("".join(smiles_list))
charset.add(" ")  # padding character
char_to_int = {char: i for i, char in enumerate(sorted(charset))}
int_to_char = {i: char for char, i in char_to_int.items()}

# Encode SMILES to fixed-length integer sequences, padded with the space char
max_length = max(len(smiles) for smiles in smiles_list)
encoded_smiles = [[char_to_int[char] for char in smiles] for smiles in smiles_list]
encoded_smiles = pad_sequences(encoded_smiles, maxlen=max_length,
                               padding='post', value=char_to_int[" "])

# One-hot encode: shape (n_samples, max_length, vocab_size)
one_hot_encoded_smiles = np.array(
    [to_categorical(smile, num_classes=len(charset)) for smile in encoded_smiles])

# Split data into training and testing sets
X_train, X_test = train_test_split(one_hot_encoded_smiles,
                                   test_size=0.2, random_state=42)

# VAE hyperparameters
input_shape = X_train.shape[1:]  # (max_length, vocab_size)
latent_dim = 2  # kept small for this toy example

# --- Encoder ---------------------------------------------------------------
inputs = layers.Input(shape=input_shape)
x = layers.LSTM(128, return_sequences=False)(inputs)
z_mean = layers.Dense(latent_dim)(x)
z_log_var = layers.Dense(latent_dim)(x)


def sampling(args):
    """Reparameterization trick: z = mu + exp(0.5 * log_var) * eps."""
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(K.shape(z_mean)[0], latent_dim))
    return z_mean + K.exp(0.5 * z_log_var) * epsilon


z = layers.Lambda(sampling)([z_mean, z_log_var])

# --- Decoder ---------------------------------------------------------------
decoder_input = layers.Input(shape=(latent_dim,))
x = layers.RepeatVector(max_length)(decoder_input)
x = layers.LSTM(128, return_sequences=True)(x)
decoded = layers.TimeDistributed(
    layers.Dense(len(charset), activation='softmax'))(x)

# --- Models ----------------------------------------------------------------
encoder = models.Model(inputs, z_mean)
decoder = models.Model(decoder_input, decoded)

# BUG FIX: the original decoded encoder(inputs) == z_mean, so the sampled z
# from the Lambda layer was dead code and the network trained as a plain
# autoencoder (while still paying the KL penalty). A VAE must decode the
# *sampled* latent vector during training.
outputs = decoder(z)
vae = models.Model(inputs, outputs)

# BUG FIX: the original passed a closure over the symbolic tensors
# z_mean/z_log_var as compile(loss=...), which breaks under TF2 eager
# execution. add_loss is the supported way to fold graph tensors into the
# loss (this is how the official Keras VAE example does it).
reconstruction_loss = K.sum(K.categorical_crossentropy(inputs, outputs), axis=-1)
kl_loss = -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var),
                       axis=-1)
vae.add_loss(K.mean(reconstruction_loss + kl_loss))
vae.compile(optimizer='rmsprop')
vae.summary()

# --- Training --------------------------------------------------------------
epochs = 50  # adjust as needed
batch_size = 32
# Targets are implicit via add_loss, so only the inputs are passed.
vae.fit(X_train, epochs=epochs, batch_size=batch_size,
        validation_data=(X_test, None))

# --- Generation ------------------------------------------------------------
# Sample new points from the latent prior N(0, I) and decode them.
num_molecules = 10
new_latent_points = np.random.normal(size=(num_molecules, latent_dim))
generated_smiles = decoder.predict(new_latent_points)


def decode_smiles(encoded):
    """Greedy argmax decode of a (max_length, vocab) probability matrix,
    stripping the trailing space padding."""
    return "".join(int_to_char[int(np.argmax(vec))] for vec in encoded).strip()


decoded_smiles = [decode_smiles(smile) for smile in generated_smiles]
print("Generated SMILES:")
for smile in decoded_smiles:
    print(smile)

# Visualize only the chemically valid generated molecules
# (MolFromSmiles returns None for invalid strings).
for smile in decoded_smiles:
    mol = Chem.MolFromSmiles(smile)
    if mol:
        display(Draw.MolToImage(mol))
Leave a Comment