!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

import os
import tempfile

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

# On matplotlib >= 3.6 this style is named 'seaborn-v0_8-whitegrid'.
plt.style.use('seaborn-whitegrid')

from google.colab import drive
drive.mount('/content/drive')

# Load the movie metadata and ratings CSVs stored on Drive.
movies_metadata = pd.read_csv('/content/drive/My Drive/movies_metadata.csv')
ratings = pd.read_csv('/content/drive/My Drive/ratings.csv')

ratings = ratings[['userId', 'movieId', 'timestamp']].rename(
    columns={'movieId': 'movie_id', 'userId': 'user_id'})
movies_metadata = movies_metadata[['id', 'title']].rename(
    columns={'id': 'movie_id', 'title': 'movie_title'})

# Cast ids to strings on both sides so the join keys match.
ratings['movie_id'] = ratings['movie_id'].astype(str)
movies_metadata['movie_id'] = movies_metadata['movie_id'].astype(str)

combined_dataset = pd.merge(ratings, movies_metadata, on='movie_id', how='inner')
combined_dataset = combined_dataset.sample(100_000, random_state=1)

# Create a TensorFlow dataset from the merged frame.
combined_dataset_tf = tf.data.Dataset.from_tensor_slices({
    'user_id': combined_dataset['user_id'].astype(str).values,
    'movie_title': combined_dataset['movie_title'].astype(str).values,
    'timestamp': combined_dataset['timestamp'].values,
    # 'genres': genres_encoded  # would need an array with consistent dimensions
})

movies = combined_dataset_tf.map(lambda x: x["movie_title"])

# Bucketize timestamps so they can be embedded as a categorical feature.
timestamps = np.concatenate(
    list(combined_dataset_tf.map(lambda x: x["timestamp"]).batch(100)))
max_timestamp = timestamps.max()
min_timestamp = timestamps.min()
timestamp_buckets = np.linspace(min_timestamp, max_timestamp, num=1000)

movie_titles = movies.batch(1_000)
user_ids = combined_dataset_tf.batch(1_000_000).map(lambda x: x["user_id"])

unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))


class UserModel(tf.keras.Model):
    """Embeds the user id together with the rating timestamp."""

    def __init__(self):
        super().__init__()
        self.user_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32),
        ])
        self.timestamp_embedding = tf.keras.Sequential([
            tf.keras.layers.Discretization(timestamp_buckets.tolist()),
            tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
        ])
        self.normalized_timestamp = tf.keras.layers.Normalization(axis=None)
        self.normalized_timestamp.adapt(timestamps)

    def call(self, inputs):
        user_embeds = self.user_embedding(inputs["user_id"])
        timestamp_embeds = self.timestamp_embedding(inputs["timestamp"])
        normalized_time = tf.reshape(
            self.normalized_timestamp(inputs["timestamp"]), (-1, 1))
        return tf.concat(
            [user_embeds, timestamp_embeds, normalized_time], axis=1)


class QueryModel(tf.keras.Model):
    """Model for encoding user queries."""

    def __init__(self, layer_sizes):
        """Model for encoding user queries.

        Args:
          layer_sizes: A list of integers where the i-th entry represents
            the number of units the i-th layer contains.
        """
        super().__init__()
        # We first use the user model for generating embeddings.
        self.embedding_model = UserModel()
        # Then construct the dense layers.
        self.dense_layers = tf.keras.Sequential()
        # Use the ReLU activation for all but the last layer.
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(
                tf.keras.layers.Dense(layer_size, activation="relu"))
        # No activation for the last layer.
        for layer_size in layer_sizes[-1:]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size))

    def call(self, inputs):
        feature_embedding = self.embedding_model(inputs)
        return self.dense_layers(feature_embedding)


class MovieModel(tf.keras.Model):
    """Embeds movie titles both as ids and as bags of words."""

    def __init__(self):
        super().__init__()
        max_tokens = 10_000
        self.title_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_movie_titles, mask_token=None),
            tf.keras.layers.Embedding(len(unique_movie_titles) + 1, 32),
        ])
        self.title_vectorizer = tf.keras.layers.TextVectorization(
            max_tokens=max_tokens)
        self.title_text_embedding = tf.keras.Sequential([
            self.title_vectorizer,
            tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
            tf.keras.layers.GlobalAveragePooling1D(),
        ])
        # Adapt the title_vectorizer using the movie titles.
        self.title_vectorizer.adapt(movies)

    def call(self, titles):
        return tf.concat([
            self.title_embedding(titles),
            self.title_text_embedding(titles),
        ], axis=1)


class CandidateModel(tf.keras.Model):
    """Model for encoding movie candidates."""

    def __init__(self, layer_sizes):
        super().__init__()
        self.embedding_model = MovieModel()
        # Then construct the dense layers.
        self.dense_layers = tf.keras.Sequential()
        # Use the ReLU activation for all but the last layer.
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(
                tf.keras.layers.Dense(layer_size, activation="relu"))
        # No activation for the last layer.
        for layer_size in layer_sizes[-1:]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size))

    def call(self, inputs):
        feature_embedding = self.embedding_model(inputs)
        return self.dense_layers(feature_embedding)


class MovielensModel(tfrs.models.Model):
    """Two-tower retrieval model combining the query and candidate towers."""

    def __init__(self, layer_sizes):
        super().__init__()
        self.query_model = QueryModel(layer_sizes)
        self.candidate_model = CandidateModel(layer_sizes)
        # Evaluate against the de-duplicated candidate set: `movies` holds one
        # entry per rating, so titles repeat and would skew the top-k metrics.
        candidate_titles = tf.data.Dataset.from_tensor_slices(unique_movie_titles)
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=candidate_titles.batch(128).map(self.candidate_model),
            ),
        )

    def compute_loss(self, features, training=False):
        query_embeddings = self.query_model({
            "user_id": features["user_id"],
            "timestamp": features["timestamp"],
        })
        movie_embeddings = self.candidate_model(features["movie_title"])
        # Skip the (expensive) top-k metrics during training.
        return self.task(
            query_embeddings, movie_embeddings, compute_metrics=not training)


tf.random.set_seed(1)
shuffled = combined_dataset_tf.shuffle(
    100_000, seed=1, reshuffle_each_iteration=False)

# TODO: these split sizes need to be adjusted for my data!!
train = shuffled.take(80_000)
val = shuffled.skip(80_000).take(10_000)
test = shuffled.skip(90_000).take(10_000)

cached_train = train.shuffle(100_000).batch(4096)
cached_val = val.batch(4096).cache()
cached_test = test.batch(4096).cache()

num_epochs = 3

model = MovielensModel([32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

one_layer_history = model.fit(
    cached_train,
    validation_data=cached_test,
    epochs=num_epochs)

accuracy = one_layer_history.history[
    "val_factorized_top_k/top_100_categorical_accuracy"][-1]
print(f"Top-100 accuracy: {accuracy:.2f}.")
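
# A minimal serving sketch, assuming the `model` trained above: wrap the query
# tower in a brute-force top-k index over the candidate embeddings. The user
# id "42", the query timestamp, and k=5 are illustrative values only.
candidates = tf.data.Dataset.from_tensor_slices(unique_movie_titles)
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)
index.index_from_dataset(
    tf.data.Dataset.zip((
        candidates.batch(128),
        candidates.batch(128).map(model.candidate_model),
    ))
)

# Query with the same feature dict the QueryModel expects.
scores, titles = index({
    "user_id": np.array(["42"]),
    "timestamp": np.array([max_timestamp]),
}, k=5)
print(f"Recommendations: {titles[0].numpy()}")

# The fitted index can be exported for serving like any Keras model, e.g.:
# tf.saved_model.save(index, "retrieval_index")  # "retrieval_index" is a placeholder path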