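# Untitled
# Pasted from pastecode.io (author: unknown, format: plain_text, size: 7.0 kB).
# Notebook-style script: the `!pip ...` and `%matplotlib ...` lines below are
# IPython/Colab magics and only run inside a notebook environment.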
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

import os
import tempfile

%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs

# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*"
# and newer releases no longer ship the old names, so use whichever spelling
# this installation provides and fall back to the default style otherwise.
_whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)
plt.style.use(_whitegrid_style)

# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

# Read only the columns that are used below; the remaining columns of both
# CSV files are never referenced, so skipping them saves memory.
movies_metadata = pd.read_csv(
    '/content/drive/My Drive/movies_metadata.csv',
    usecols=['id', 'title'],
)
ratings = pd.read_csv(
    '/content/drive/My Drive/ratings.csv',
    usecols=['userId', 'movieId', 'timestamp'],
)

# Normalise the column names so both frames share a `movie_id` join key.
ratings = ratings[['userId', 'movieId', 'timestamp']].rename(
    columns={'movieId': 'movie_id', 'userId': 'user_id'})
movies_metadata = movies_metadata[['id', 'title']].rename(
    columns={'id': 'movie_id', 'title': 'movie_title'})

# Compare the keys as strings so differing dtypes cannot break the join.
ratings['movie_id'] = ratings['movie_id'].astype(str)
movies_metadata['movie_id'] = movies_metadata['movie_id'].astype(str)

# NOTE(review): this joins `ratings.movieId` against `movies_metadata.id`.
# If these files are the Kaggle "The Movies Dataset" export, those are two
# different ID spaces (MovieLens vs. TMDB) and the join should go through
# the dataset's links table instead - TODO confirm against the data source.
combined_dataset = pd.merge(ratings, movies_metadata, on='movie_id', how='inner')

# Down-sample to at most 100,000 interactions. Clamping the size avoids the
# ValueError that DataFrame.sample raises when asked for more rows than exist.
sample_size = min(100_000, len(combined_dataset))
combined_dataset = combined_dataset.sample(sample_size, random_state=1)
import numpy as np

# Build a tf.data pipeline whose elements are dicts with one entry per feature.
feature_arrays = {
    'user_id': combined_dataset['user_id'].astype(str).values,
    'movie_title': combined_dataset['movie_title'].astype(str).values,
    'timestamp': combined_dataset['timestamp'].values,
}
combined_dataset_tf = tf.data.Dataset.from_tensor_slices(feature_arrays)

# Title-only view of the same rows (one element per interaction).
movies = combined_dataset_tf.map(lambda row: row["movie_title"])

# Pull every timestamp back into a single NumPy array.
timestamp_batches = combined_dataset_tf.map(lambda row: row["timestamp"]).batch(100)
timestamps = np.concatenate(list(timestamp_batches))

max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

# 1000 evenly spaced boundaries spanning the observed timestamp range.
timestamp_buckets = np.linspace(min_timestamp, max_timestamp, num=1000)

movie_titles = movies.batch(1_000)
user_ids = combined_dataset_tf.batch(1_000_000).map(lambda batch: batch["user_id"])

# Vocabularies for the lookup layers: the distinct titles and user ids.
unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

class UserModel(tf.keras.Model):
  """Embeds the query side features: the user id and the rating timestamp.

  The vocabulary (`unique_user_ids`), the bucket boundaries
  (`timestamp_buckets`) and the raw `timestamps` array are read from module
  scope when the model is constructed.
  """

  def __init__(self):
    super().__init__()

    # User id -> integer index -> 32-dimensional embedding. The table has one
    # extra row on top of the vocabulary size.
    user_lookup = tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None)
    user_table = tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32)
    self.user_embedding = tf.keras.Sequential([user_lookup, user_table])

    # Timestamp -> bucket index -> 32-dimensional embedding.
    bucketizer = tf.keras.layers.Discretization(timestamp_buckets.tolist())
    bucket_table = tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32)
    self.timestamp_embedding = tf.keras.Sequential([bucketizer, bucket_table])

    # Scalar normalisation of the raw timestamp, fitted on all timestamps.
    self.normalized_timestamp = tf.keras.layers.Normalization(axis=None)
    self.normalized_timestamp.adapt(timestamps)

  def call(self, inputs):
    """Return the concatenated user/timestamp features.

    Args:
      inputs: A mapping with the keys "user_id" and "timestamp".

    Returns:
      The user embedding, the timestamp-bucket embedding and the normalised
      timestamp (reshaped to a single column), concatenated along axis 1.
    """
    raw_timestamp = inputs["timestamp"]
    pieces = [
        self.user_embedding(inputs["user_id"]),
        self.timestamp_embedding(raw_timestamp),
        tf.reshape(self.normalized_timestamp(raw_timestamp), (-1, 1)),
    ]
    return tf.concat(pieces, axis=1)



class QueryModel(tf.keras.Model):
  """Query tower: `UserModel` features followed by a stack of dense layers."""

  def __init__(self, layer_sizes):
    """Build the query tower.

    Args:
      layer_sizes:
        A list of integers; the i-th entry is the number of units of the
        i-th dense layer. Every layer except the last uses ReLU, the last
        one has no activation.
    """
    super().__init__()

    # Feature embeddings come from the user model.
    self.embedding_model = UserModel()

    # Dense stack applied on top of the embeddings.
    self.dense_layers = tf.keras.Sequential()
    final_position = len(layer_sizes) - 1
    for position, units in enumerate(layer_sizes):
      activation = "relu" if position < final_position else None
      self.dense_layers.add(
          tf.keras.layers.Dense(units, activation=activation))

  def call(self, inputs):
    """Embed `inputs` with the user model and run the dense stack on it."""
    return self.dense_layers(self.embedding_model(inputs))

from typing import Dict, Text

class MovieModel(tf.keras.Model):
  """Embeds a movie title both as a whole string and as a bag of tokens.

  The title vocabulary (`unique_movie_titles`) and the `movies` dataset used
  to fit the text vectorizer are read from module scope at construction time.
  """

  def __init__(self):
    super().__init__()

    max_tokens = 10_000

    # Whole title -> integer index -> 32-dimensional embedding.
    title_lookup = tf.keras.layers.StringLookup(
        vocabulary=unique_movie_titles, mask_token=None)
    title_table = tf.keras.layers.Embedding(len(unique_movie_titles) + 1, 32)
    self.title_embedding = tf.keras.Sequential([title_lookup, title_table])

    # Title text -> token ids -> per-token embeddings -> average over tokens.
    self.title_vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=max_tokens)
    token_table = tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True)
    token_pooling = tf.keras.layers.GlobalAveragePooling1D()
    self.title_text_embedding = tf.keras.Sequential([
        self.title_vectorizer,
        token_table,
        token_pooling,
    ])

    # Fit the vectorizer's vocabulary on the movie titles.
    self.title_vectorizer.adapt(movies)

  def call(self, titles):
    """Return both title embeddings concatenated along axis 1."""
    whole_title = self.title_embedding(titles)
    pooled_tokens = self.title_text_embedding(titles)
    return tf.concat([whole_title, pooled_tokens], axis=1)


class CandidateModel(tf.keras.Model):
  """Candidate tower: `MovieModel` features followed by dense layers."""

  def __init__(self, layer_sizes):
    """Build the candidate tower.

    Args:
      layer_sizes:
        A list of integers; the i-th entry is the number of units of the
        i-th dense layer. Every layer except the last uses ReLU, the last
        one has no activation.
    """
    super().__init__()

    self.embedding_model = MovieModel()

    # Dense stack applied on top of the movie embeddings.
    self.dense_layers = tf.keras.Sequential()
    final_position = len(layer_sizes) - 1
    for position, units in enumerate(layer_sizes):
      activation = "relu" if position < final_position else None
      self.dense_layers.add(
          tf.keras.layers.Dense(units, activation=activation))

  def call(self, inputs):
    """Embed `inputs` with the movie model and run the dense stack on it."""
    return self.dense_layers(self.embedding_model(inputs))

class MovielensModel(tfrs.models.Model):
  """Two-tower retrieval model combining `QueryModel` and `CandidateModel`."""

  def __init__(self, layer_sizes):
    """Build both towers and the retrieval task.

    Args:
      layer_sizes:
        A list of integers passed unchanged to both towers; the i-th entry
        is the number of units of the i-th dense layer.
    """
    super().__init__()
    self.query_model = QueryModel(layer_sizes)
    self.candidate_model = CandidateModel(layer_sizes)

    # The top-K metric ranks the true movie against this candidate set, so
    # each title must appear exactly once. `movies` holds one entry per
    # interaction (titles repeat), which would fill the top-K slots with
    # copies of the same film and inflate the candidate index.
    candidate_titles = tf.data.Dataset.from_tensor_slices(unique_movie_titles)
    self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=candidate_titles.batch(128).map(self.candidate_model),
        ),
    )

  def compute_loss(self, features, training=False):
    """Return the retrieval loss for one batch.

    Args:
      features: A mapping with the keys "user_id", "timestamp" and
        "movie_title".
      training: Whether the call happens during training. The top-K metrics
        are only computed when this is False.
    """
    query_embeddings = self.query_model({
        "user_id": features["user_id"],
        "timestamp": features["timestamp"]
    })
    movie_embeddings = self.candidate_model(features["movie_title"])

    return self.task(
        query_embeddings, movie_embeddings, compute_metrics=not training)

tf.random.set_seed(1)

# TODO (translated from the original note): this still needs to be adjusted.
# reshuffle_each_iteration=False keeps the order identical on every pass, so
# the take/skip splits below always select the same examples.
shuffled = combined_dataset_tf.shuffle(100_000, seed=1, reshuffle_each_iteration=False)

# 80% / 10% / 10% split of the 100,000 sampled interactions.
train = shuffled.take(80_000)
# NOTE: `eval` shadows the Python builtin; the name is kept so that any code
# referring to it keeps working.
eval = shuffled.skip(80_000).take(10_000)
test = shuffled.skip(90_000).take(10_000)

# Reshuffle the training examples on every epoch before batching. The
# original code built this pipeline and then immediately overwrote it with an
# unshuffled `train.batch(4096)`, so every epoch saw identical batches.
cached_train = train.shuffle(100_000).batch(4096)
cached_eval = eval.batch(4096).cache()
cached_test = test.batch(4096).cache()

num_epochs = 3

model = MovielensModel([32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

# NOTE(review): validation runs on the test split while `cached_eval` is
# never used - confirm whether `cached_eval` was meant here.
one_layer_history = model.fit(
    cached_train,
    validation_data=cached_test,
    epochs=num_epochs)

# Top-100 accuracy on the validation data after the last epoch.
accuracy = one_layer_history.history["val_factorized_top_k/top_100_categorical_accuracy"][-1]
print(f"Top-100 accuracy: {accuracy:.2f}.")
# End of paste ("Leave a Comment" page footer removed).