Source code for recommenders.models.deeprec.models.graphrec.lightgcn

# Copyright (c) Recommenders contributors.
# Licensed under the MIT License.

import tensorflow as tf
import time
import os
import sys
import numpy as np
import pandas as pd
from recommenders.evaluation.python_evaluation import (
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
)
from recommenders.utils.python_utils import get_top_k_scored_items

tf.compat.v1.disable_eager_execution()  # need to disable eager in TF2.x


class LightGCN(object):
    """LightGCN model

    :Citation:

        He, Xiangnan, Kuan Deng, Xiang Wang, Yan Li, Yongdong Zhang, and Meng Wang.
        "LightGCN: Simplifying and Powering Graph Convolution Network for
        Recommendation." arXiv preprint arXiv:2002.02126, 2020.
    """
    def __init__(self, hparams, data, seed=None):
        """Initialize the model. Create parameters, placeholders, embeddings and the loss function.

        Args:
            hparams (HParams): A HParams object that holds the entire set of hyperparameters.
            data (object): A recommenders.models.deeprec.DataModel.ImplicitCF object, which loads and processes the data.
            seed (int): Seed.
        """
        tf.compat.v1.set_random_seed(seed)
        np.random.seed(seed)

        self.data = data
        self.epochs = hparams.epochs
        self.lr = hparams.learning_rate
        self.emb_dim = hparams.embed_size
        self.batch_size = hparams.batch_size
        self.n_layers = hparams.n_layers
        self.decay = hparams.decay
        self.eval_epoch = hparams.eval_epoch
        self.top_k = hparams.top_k
        self.save_model = hparams.save_model
        self.save_epoch = hparams.save_epoch
        self.metrics = hparams.metrics
        self.model_dir = hparams.MODEL_DIR

        metric_options = ["map", "ndcg", "precision", "recall"]
        for metric in self.metrics:
            if metric not in metric_options:
                raise ValueError(
                    "Wrong metric(s), please select one of this list: {}".format(
                        metric_options
                    )
                )

        self.norm_adj = data.get_norm_adj_mat()
        self.n_users = data.n_users
        self.n_items = data.n_items

        self.users = tf.compat.v1.placeholder(tf.int32, shape=(None,))
        self.pos_items = tf.compat.v1.placeholder(tf.int32, shape=(None,))
        self.neg_items = tf.compat.v1.placeholder(tf.int32, shape=(None,))

        self.weights = self._init_weights()
        self.ua_embeddings, self.ia_embeddings = self._create_lightgcn_embed()

        self.u_g_embeddings = tf.nn.embedding_lookup(
            params=self.ua_embeddings, ids=self.users
        )
        self.pos_i_g_embeddings = tf.nn.embedding_lookup(
            params=self.ia_embeddings, ids=self.pos_items
        )
        self.neg_i_g_embeddings = tf.nn.embedding_lookup(
            params=self.ia_embeddings, ids=self.neg_items
        )
        self.u_g_embeddings_pre = tf.nn.embedding_lookup(
            params=self.weights["user_embedding"], ids=self.users
        )
        self.pos_i_g_embeddings_pre = tf.nn.embedding_lookup(
            params=self.weights["item_embedding"], ids=self.pos_items
        )
        self.neg_i_g_embeddings_pre = tf.nn.embedding_lookup(
            params=self.weights["item_embedding"], ids=self.neg_items
        )

        self.batch_ratings = tf.matmul(
            self.u_g_embeddings,
            self.pos_i_g_embeddings,
            transpose_a=False,
            transpose_b=True,
        )

        self.mf_loss, self.emb_loss = self._create_bpr_loss(
            self.u_g_embeddings, self.pos_i_g_embeddings, self.neg_i_g_embeddings
        )
        self.loss = self.mf_loss + self.emb_loss
        self.opt = tf.compat.v1.train.AdamOptimizer(learning_rate=self.lr).minimize(
            self.loss
        )
        self.saver = tf.compat.v1.train.Saver(max_to_keep=1)

        gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
        self.sess = tf.compat.v1.Session(
            config=tf.compat.v1.ConfigProto(gpu_options=gpu_options)
        )
        self.sess.run(tf.compat.v1.global_variables_initializer())
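    # ------------------------------------------------------------------
    # Example (illustrative, not part of the original module): a minimal
    # end-to-end sketch of building and training the model. `train` and
    # `test` are assumed to be pandas DataFrames with user/item/rating
    # columns, and `yaml_file` a LightGCN config file path; both names
    # are assumptions of this sketch.
    #
    #     from recommenders.models.deeprec.deeprec_utils import prepare_hparams
    #     from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
    #
    #     hparams = prepare_hparams(
    #         yaml_file, n_layers=3, batch_size=1024, epochs=50,
    #         learning_rate=0.005, eval_epoch=5, top_k=10,
    #     )
    #     data = ImplicitCF(train=train, test=test, seed=42)
    #     model = LightGCN(hparams, data, seed=42)
    #     model.fit()
    # ------------------------------------------------------------------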
    def _init_weights(self):
        """Initialize user and item embeddings.

        Returns:
            dict: With keys `user_embedding` and `item_embedding`, embeddings of all users and items.
        """
        all_weights = dict()
        initializer = tf.compat.v1.keras.initializers.VarianceScaling(
            scale=1.0, mode="fan_avg", distribution="uniform"
        )

        all_weights["user_embedding"] = tf.Variable(
            initializer([self.n_users, self.emb_dim]), name="user_embedding"
        )
        all_weights["item_embedding"] = tf.Variable(
            initializer([self.n_items, self.emb_dim]), name="item_embedding"
        )
        print("Using xavier initialization.")

        return all_weights

    def _create_lightgcn_embed(self):
        """Calculate the average embeddings of users and items after every layer of the model.

        Returns:
            tf.Tensor, tf.Tensor: Average user embeddings. Average item embeddings.
        """
        A_hat = self._convert_sp_mat_to_sp_tensor(self.norm_adj)

        ego_embeddings = tf.concat(
            [self.weights["user_embedding"], self.weights["item_embedding"]], axis=0
        )
        all_embeddings = [ego_embeddings]

        for k in range(0, self.n_layers):
            ego_embeddings = tf.sparse.sparse_dense_matmul(A_hat, ego_embeddings)
            all_embeddings += [ego_embeddings]

        all_embeddings = tf.stack(all_embeddings, 1)
        all_embeddings = tf.reduce_mean(
            input_tensor=all_embeddings, axis=1, keepdims=False
        )

        u_g_embeddings, i_g_embeddings = tf.split(
            all_embeddings, [self.n_users, self.n_items], 0
        )
        return u_g_embeddings, i_g_embeddings

    def _create_bpr_loss(self, users, pos_items, neg_items):
        """Calculate BPR loss.

        Args:
            users (tf.Tensor): User embeddings to calculate loss.
            pos_items (tf.Tensor): Positive item embeddings to calculate loss.
            neg_items (tf.Tensor): Negative item embeddings to calculate loss.

        Returns:
            tf.Tensor, tf.Tensor: Matrix factorization loss. Embedding regularization loss.
        """
        pos_scores = tf.reduce_sum(input_tensor=tf.multiply(users, pos_items), axis=1)
        neg_scores = tf.reduce_sum(input_tensor=tf.multiply(users, neg_items), axis=1)

        regularizer = (
            tf.nn.l2_loss(self.u_g_embeddings_pre)
            + tf.nn.l2_loss(self.pos_i_g_embeddings_pre)
            + tf.nn.l2_loss(self.neg_i_g_embeddings_pre)
        )
        regularizer = regularizer / self.batch_size

        mf_loss = tf.reduce_mean(
            input_tensor=tf.nn.softplus(-(pos_scores - neg_scores))
        )
        emb_loss = self.decay * regularizer

        return mf_loss, emb_loss

    def _convert_sp_mat_to_sp_tensor(self, X):
        """Convert a scipy sparse matrix to tf.SparseTensor.

        Args:
            X (scipy.sparse.spmatrix): Sparse matrix to convert.

        Returns:
            tf.SparseTensor: SparseTensor after conversion.
        """
        coo = X.tocoo().astype(np.float32)
        # np.mat is deprecated (and removed in NumPy 2.0); build the index
        # array with np.asarray instead.
        indices = np.asarray([coo.row, coo.col]).transpose()
        return tf.SparseTensor(indices, coo.data, coo.shape)
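    # ------------------------------------------------------------------
    # Note (added for clarity): `_create_lightgcn_embed` implements the
    # layer-wise propagation rule of the LightGCN paper,
    #
    #     E^(k+1) = A_hat E^(k),    E = mean(E^(0), ..., E^(K)),
    #
    # where A_hat is the symmetrically normalized user-item adjacency
    # matrix returned by `data.get_norm_adj_mat()` and E^(0) stacks the
    # user and item embedding tables. `_create_bpr_loss` implements the
    # BPR objective
    #
    #     L = mean(softplus(-(y_ui - y_uj))) + decay * ||E^(0)||^2 / B,
    #
    # using the identity softplus(-x) = -log(sigmoid(x)), with B the
    # batch size and (i, j) a positive/negative item pair for user u.
    # ------------------------------------------------------------------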
    def fit(self):
        """Fit the model on self.data.train. If eval_epoch is not -1, evaluate the model on
        `self.data.test` every `eval_epoch` epoch to observe the training status.
        """
        for epoch in range(1, self.epochs + 1):
            train_start = time.time()
            loss, mf_loss, emb_loss = 0.0, 0.0, 0.0
            n_batch = self.data.train.shape[0] // self.batch_size + 1
            for idx in range(n_batch):
                users, pos_items, neg_items = self.data.train_loader(self.batch_size)
                _, batch_loss, batch_mf_loss, batch_emb_loss = self.sess.run(
                    [self.opt, self.loss, self.mf_loss, self.emb_loss],
                    feed_dict={
                        self.users: users,
                        self.pos_items: pos_items,
                        self.neg_items: neg_items,
                    },
                )
                loss += batch_loss / n_batch
                mf_loss += batch_mf_loss / n_batch
                emb_loss += batch_emb_loss / n_batch

            if np.isnan(loss):
                print("ERROR: loss is nan.")
                sys.exit()
            train_end = time.time()
            train_time = train_end - train_start

            if self.save_model and epoch % self.save_epoch == 0:
                save_path_str = os.path.join(self.model_dir, "epoch_" + str(epoch))
                if not os.path.exists(save_path_str):
                    os.makedirs(save_path_str)
                checkpoint_path = self.saver.save(  # noqa: F841
                    sess=self.sess, save_path=save_path_str
                )
                print("Save model to path {0}".format(os.path.abspath(save_path_str)))

            if self.eval_epoch == -1 or epoch % self.eval_epoch != 0:
                print(
                    "Epoch %d (train)%.1fs: train loss = %.5f = (mf)%.5f + (embed)%.5f"
                    % (epoch, train_time, loss, mf_loss, emb_loss)
                )
            else:
                eval_start = time.time()
                ret = self.run_eval()
                eval_end = time.time()
                eval_time = eval_end - eval_start

                print(
                    "Epoch %d (train)%.1fs + (eval)%.1fs: train loss = %.5f = (mf)%.5f + (embed)%.5f, %s"
                    % (
                        epoch,
                        train_time,
                        eval_time,
                        loss,
                        mf_loss,
                        emb_loss,
                        ", ".join(
                            metric + " = %.5f" % (r)
                            for metric, r in zip(self.metrics, ret)
                        ),
                    )
                )
    def load(self, model_path=None):
        """Load an existing model.

        Args:
            model_path (str): Model path.

        Raises:
            IOError: If the restore operation failed.
        """
        try:
            self.saver.restore(self.sess, model_path)
        except Exception:
            raise IOError(
                "Failed to find any matching files for {0}".format(model_path)
            )
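    # ------------------------------------------------------------------
    # Example (illustrative): restoring a checkpoint written by `fit`
    # when `save_model` is enabled; the epoch number is an assumption.
    #
    #     model.load(os.path.join(hparams.MODEL_DIR, "epoch_50"))
    # ------------------------------------------------------------------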
    def run_eval(self):
        """Run evaluation on self.data.test.

        Returns:
            list: Results of the metrics in `self.metrics`, in the same order.
        """
        topk_scores = self.recommend_k_items(
            self.data.test, top_k=self.top_k, use_id=True
        )
        ret = []
        for metric in self.metrics:
            if metric == "map":
                ret.append(map_at_k(self.data.test, topk_scores, k=self.top_k))
            elif metric == "ndcg":
                ret.append(ndcg_at_k(self.data.test, topk_scores, k=self.top_k))
            elif metric == "precision":
                ret.append(
                    precision_at_k(self.data.test, topk_scores, k=self.top_k)
                )
            elif metric == "recall":
                ret.append(recall_at_k(self.data.test, topk_scores, k=self.top_k))
        return ret
    def score(self, user_ids, remove_seen=True):
        """Score all items for the given users.

        Args:
            user_ids (np.array): Users to score, by internal id.
            remove_seen (bool): Flag to remove items seen in training from recommendation.

        Returns:
            numpy.ndarray: Scores of all items for each user.
        """
        if any(np.isnan(user_ids)):
            raise ValueError(
                "LightGCN cannot score users that are not in the training set"
            )
        u_batch_size = self.batch_size
        n_user_batchs = len(user_ids) // u_batch_size + 1
        test_scores = []
        for u_batch_id in range(n_user_batchs):
            start = u_batch_id * u_batch_size
            end = (u_batch_id + 1) * u_batch_size
            user_batch = user_ids[start:end]
            item_batch = range(self.data.n_items)
            rate_batch = self.sess.run(
                self.batch_ratings,
                {self.users: user_batch, self.pos_items: item_batch},
            )
            test_scores.append(np.array(rate_batch))
        test_scores = np.concatenate(test_scores, axis=0)
        if remove_seen:
            # Mask items already seen in training with -inf so they are
            # never recommended.
            test_scores += self.data.R.tocsr()[user_ids, :] * -np.inf
        return test_scores
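    # ------------------------------------------------------------------
    # Example (illustrative): scoring all items for a few users by their
    # internal ids (the ids assigned by the ImplicitCF data object, not
    # the raw user ids).
    #
    #     scores = model.score(np.array([0, 1, 2]), remove_seen=True)
    #     # scores.shape == (3, model.n_items)
    # ------------------------------------------------------------------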
    def recommend_k_items(
        self, test, top_k=10, sort_top_k=True, remove_seen=True, use_id=False
    ):
        """Recommend top K items for all users in the test set.

        Args:
            test (pandas.DataFrame): Test data.
            top_k (int): Number of top items to recommend.
            sort_top_k (bool): Flag to sort top k results.
            remove_seen (bool): Flag to remove items seen in training from recommendation.
            use_id (bool): Flag to use internal user and item ids instead of the raw values in `test`.

        Returns:
            pandas.DataFrame: Top k recommendation items for each user.
        """
        data = self.data
        if not use_id:
            user_ids = np.array([data.user2id[x] for x in test[data.col_user].unique()])
        else:
            user_ids = np.array(test[data.col_user].unique())

        test_scores = self.score(user_ids, remove_seen=remove_seen)

        top_items, top_scores = get_top_k_scored_items(
            scores=test_scores, top_k=top_k, sort_top_k=sort_top_k
        )

        df = pd.DataFrame(
            {
                data.col_user: np.repeat(
                    test[data.col_user].drop_duplicates().values, top_items.shape[1]
                ),
                data.col_item: top_items.flatten()
                if use_id
                else [data.id2item[item] for item in top_items.flatten()],
                data.col_prediction: top_scores.flatten(),
            }
        )

        # Drop rows whose score is -inf, i.e. items masked out by remove_seen.
        return df.replace(-np.inf, np.nan).dropna()
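    # ------------------------------------------------------------------
    # Example (illustrative): producing and evaluating top-k
    # recommendations with a fitted model; `test` is the same DataFrame
    # assumed in the sketch after __init__.
    #
    #     topk_scores = model.recommend_k_items(test, top_k=10, remove_seen=True)
    #     print(map_at_k(test, topk_scores, k=10))
    #     print(ndcg_at_k(test, topk_scores, k=10))
    # ------------------------------------------------------------------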
    def output_embeddings(self, idmapper, n, target, user_file):
        """Export the embeddings of `target` to a tab-separated file.

        Args:
            idmapper (dict): Mapping from internal ids back to the original user or item ids.
            n (int): Number of rows (users or items) to export.
            target (tf.Tensor): Embedding tensor to export.
            user_file (str): Path of the output file.
        """
        embeddings = list(target.eval(session=self.sess))
        with open(user_file, "w") as wt:
            for i in range(n):
                wt.write(
                    "{0}\t{1}\n".format(
                        idmapper[i], " ".join([str(a) for a in embeddings[i]])
                    )
                )
    def infer_embedding(self, user_file, item_file):
        """Export user and item embeddings to tab-separated files.

        Args:
            user_file (str): Path of file to save user embeddings.
            item_file (str): Path of file to save item embeddings.
        """
        # Create output directories if they do not exist
        dirs, _ = os.path.split(user_file)
        if not os.path.exists(dirs):
            os.makedirs(dirs)
        dirs, _ = os.path.split(item_file)
        if not os.path.exists(dirs):
            os.makedirs(dirs)

        data = self.data

        self.output_embeddings(
            data.id2user, self.n_users, self.ua_embeddings, user_file
        )
        self.output_embeddings(
            data.id2item, self.n_items, self.ia_embeddings, item_file
        )
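# ----------------------------------------------------------------------
# Example (illustrative): exporting trained embeddings and reading them
# back. Each output row holds an id, a tab, and a space-separated
# embedding vector; the file paths are assumptions of this sketch.
#
#     model.infer_embedding("output/user_embeddings.tsv", "output/item_embeddings.tsv")
#     user_emb = pd.read_csv("output/user_embeddings.tsv", sep="\t", header=None)
# ----------------------------------------------------------------------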