Source code for recommenders.models.sar.sar_singlenode

# Copyright (c) Recommenders contributors.
# Licensed under the MIT License.

import numpy as np
import pandas as pd
import logging
from scipy import sparse

from recommenders.utils.python_utils import (
    cosine_similarity,
    inclusion_index,
    jaccard,
    lexicographers_mutual_information,
    lift,
    mutual_information,
    exponential_decay,
    get_top_k_scored_items,
    rescale,
)
from recommenders.utils import constants


SIM_COOCCUR = "cooccurrence"
SIM_COSINE = "cosine"
SIM_INCLUSION_INDEX = "inclusion index"
SIM_JACCARD = "jaccard"
SIM_LEXICOGRAPHERS_MUTUAL_INFORMATION = "lexicographers mutual information"
SIM_LIFT = "lift"
SIM_MUTUAL_INFORMATION = "mutual information"

logger = logging.getLogger()


class SARSingleNode:
    """Simple Algorithm for Recommendations (SAR) implementation

    SAR is a fast, scalable, adaptive algorithm for personalized recommendations
    based on user transaction history and item descriptions. The core idea behind
    SAR is to recommend items like those that a user has already demonstrated an
    affinity to. It does this by 1) estimating the affinity of users for items,
    2) estimating similarity across items, and then 3) combining the estimates to
    generate a set of recommendations for a given user.
    """
    def __init__(
        self,
        col_user=constants.DEFAULT_USER_COL,
        col_item=constants.DEFAULT_ITEM_COL,
        col_rating=constants.DEFAULT_RATING_COL,
        col_timestamp=constants.DEFAULT_TIMESTAMP_COL,
        col_prediction=constants.DEFAULT_PREDICTION_COL,
        similarity_type=SIM_JACCARD,
        time_decay_coefficient=30,
        time_now=None,
        timedecay_formula=False,
        threshold=1,
        normalize=False,
    ):
        """Initialize model parameters

        Args:
            col_user (str): user column name
            col_item (str): item column name
            col_rating (str): rating column name
            col_timestamp (str): timestamp column name
            col_prediction (str): prediction column name
            similarity_type (str): ['cooccurrence', 'cosine', 'inclusion index',
                'jaccard', 'lexicographers mutual information', 'lift',
                'mutual information'] option for computing item-item similarity
            time_decay_coefficient (float): number of days until ratings are decayed by 1/2
            time_now (int | None): current time for time decay calculation
            timedecay_formula (bool): flag to apply time decay
            threshold (int): item-item co-occurrences below this threshold will be removed
            normalize (bool): option for normalizing predictions to scale of original ratings
        """
        self.col_rating = col_rating
        self.col_item = col_item
        self.col_user = col_user
        self.col_timestamp = col_timestamp
        self.col_prediction = col_prediction

        available_similarity_types = [
            SIM_COOCCUR,
            SIM_COSINE,
            SIM_INCLUSION_INDEX,
            SIM_JACCARD,
            SIM_LIFT,
            SIM_MUTUAL_INFORMATION,
            SIM_LEXICOGRAPHERS_MUTUAL_INFORMATION,
        ]
        if similarity_type not in available_similarity_types:
            raise ValueError(
                'Similarity type must be one of ["'
                + '" | "'.join(available_similarity_types)
                + '"]'
            )
        self.similarity_type = similarity_type
        self.time_decay_half_life = (
            time_decay_coefficient * 24 * 60 * 60
        )  # convert to seconds
        self.time_decay_flag = timedecay_formula
        self.time_now = time_now
        self.threshold = threshold
        self.user_affinity = None
        self.item_similarity = None
        self.item_frequencies = None
        self.user_frequencies = None

        # threshold - items below this number get set to zero in co-occurrence counts
        if self.threshold <= 0:
            raise ValueError("Threshold cannot be < 1")

        # set flag to capture unity-rating user-affinity matrix for scaling scores
        self.normalize = normalize
        self.col_unity_rating = "_unity_rating"
        self.unity_user_affinity = None

        # column for mapping user / item ids to internal indices
        self.col_item_id = "_indexed_items"
        self.col_user_id = "_indexed_users"

        # obtain all the users and items from both training and test data
        self.n_users = None
        self.n_items = None

        # the min and max of the rating scale, obtained from the training data
        self.rating_min = None
        self.rating_max = None

        # mapping for item to matrix element
        self.user2index = None
        self.item2index = None

        # the opposite of the above maps - map array index to actual string ID
        self.index2item = None
        self.index2user = None
    def compute_affinity_matrix(self, df, rating_col):
        """Affinity matrix.

        The user-affinity matrix can be constructed by treating the users and items as
        indices in a sparse matrix, and the events as the data. Here, we're treating
        the ratings as the event weights. We convert between different sparse-matrix
        formats to de-duplicate user-item pairs, otherwise they will get added up.

        Args:
            df (pandas.DataFrame): Indexed df of users and items
            rating_col (str): Name of column to use for ratings

        Returns:
            sparse.csr: Affinity matrix in Compressed Sparse Row (CSR) format.
        """
        return sparse.coo_matrix(
            (df[rating_col], (df[self.col_user_id], df[self.col_item_id])),
            shape=(self.n_users, self.n_items),
        ).tocsr()
    def compute_time_decay(self, df, decay_column):
        """Compute time decay on provided column.

        Args:
            df (pandas.DataFrame): DataFrame of users and items
            decay_column (str): column to decay

        Returns:
            pandas.DataFrame: with column decayed
        """
        # if time_now is None use the latest time
        if self.time_now is None:
            self.time_now = df[self.col_timestamp].max()

        # apply time decay to each rating
        df[decay_column] *= exponential_decay(
            value=df[self.col_timestamp],
            max_val=self.time_now,
            half_life=self.time_decay_half_life,
        )

        # group time decayed ratings by user-item and take the sum as the user-item affinity
        return df.groupby([self.col_user, self.col_item]).sum().reset_index()
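    # For intuition: exponential_decay weights each event by approximately
    # 0.5 ** ((time_now - timestamp) / half_life), so an interaction that is one
    # half-life (i.e. time_decay_coefficient days) older than time_now contributes
    # roughly half as much affinity as a current one.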
    def compute_cooccurrence_matrix(self, df):
        """Co-occurrence matrix.

        The co-occurrence matrix is defined as :math:`C = U^T * U`

        where U is the user_affinity matrix with 1's as values (instead of ratings).

        Args:
            df (pandas.DataFrame): DataFrame of users and items

        Returns:
            numpy.ndarray: Co-occurrence matrix
        """
        user_item_hits = sparse.coo_matrix(
            (np.repeat(1, df.shape[0]), (df[self.col_user_id], df[self.col_item_id])),
            shape=(self.n_users, self.n_items),
        ).tocsr()
        item_cooccurrence = user_item_hits.transpose().dot(user_item_hits)
        item_cooccurrence = item_cooccurrence.multiply(
            item_cooccurrence >= self.threshold
        )
        return item_cooccurrence.astype(df[self.col_rating].dtype)
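    # Worked example of the computation above: with two users and two items, where
    # user 0 interacted with items 0 and 1 and user 1 only with item 0, the binary
    # hit matrix is U = [[1, 1], [1, 0]], so
    #     C = U^T * U = [[2, 1], [1, 1]]
    # The diagonal holds per-item frequencies (item 0 seen twice, item 1 once) and
    # C[0, 1] = 1 counts the single user who interacted with both items. With the
    # default threshold=1 no entries are zeroed out.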
    def set_index(self, df):
        """Generate continuous indices for users and items to reduce memory usage.

        Args:
            df (pandas.DataFrame): dataframe with user and item ids
        """
        # generate a map of continuous index values to items
        self.index2item = dict(enumerate(df[self.col_item].unique()))
        self.index2user = dict(enumerate(df[self.col_user].unique()))

        # invert the mappings from above
        self.item2index = {v: k for k, v in self.index2item.items()}
        self.user2index = {v: k for k, v in self.index2user.items()}

        # set values for the total count of users and items
        self.n_users = len(self.user2index)
        self.n_items = len(self.index2item)
    def fit(self, df):
        """Main fit method for SAR.

        .. note::
            Please make sure that `df` has no duplicates.

        Args:
            df (pandas.DataFrame): User item rating dataframe (without duplicates).
        """
        select_columns = [self.col_user, self.col_item, self.col_rating]
        if self.time_decay_flag:
            select_columns += [self.col_timestamp]

        if df[select_columns].duplicated().any():
            raise ValueError("There should not be duplicates in the dataframe")

        # generate continuous indices if this hasn't been done
        if self.index2item is None:
            self.set_index(df)

        logger.info("Collecting user affinity matrix")
        if not np.issubdtype(df[self.col_rating].dtype, np.number):
            raise TypeError("Rating column data type must be numeric")

        # copy the DataFrame to avoid modification of the input
        temp_df = df[select_columns].copy()

        if self.time_decay_flag:
            logger.info("Calculating time-decayed affinities")
            temp_df = self.compute_time_decay(df=temp_df, decay_column=self.col_rating)

        logger.info("Creating index columns")
        # add mapping of user and item ids to indices
        temp_df.loc[:, self.col_item_id] = temp_df[self.col_item].apply(
            lambda item: self.item2index.get(item, np.NaN)
        )
        temp_df.loc[:, self.col_user_id] = temp_df[self.col_user].apply(
            lambda user: self.user2index.get(user, np.NaN)
        )

        if self.normalize:
            self.rating_min = temp_df[self.col_rating].min()
            self.rating_max = temp_df[self.col_rating].max()
            logger.info("Calculating normalization factors")
            temp_df[self.col_unity_rating] = 1.0
            if self.time_decay_flag:
                temp_df = self.compute_time_decay(
                    df=temp_df, decay_column=self.col_unity_rating
                )
            self.unity_user_affinity = self.compute_affinity_matrix(
                df=temp_df, rating_col=self.col_unity_rating
            )

        # affinity matrix
        logger.info("Building user affinity sparse matrix")
        self.user_affinity = self.compute_affinity_matrix(
            df=temp_df, rating_col=self.col_rating
        )

        # calculate item co-occurrence
        logger.info("Calculating item co-occurrence")
        item_cooccurrence = self.compute_cooccurrence_matrix(df=temp_df)

        # free up some space
        del temp_df

        # creates an array with the frequency of every unique item
        self.item_frequencies = item_cooccurrence.diagonal()

        logger.info("Calculating item similarity")
        if self.similarity_type == SIM_COOCCUR:
            logger.info("Using co-occurrence based similarity")
            self.item_similarity = item_cooccurrence
        elif self.similarity_type == SIM_COSINE:
            logger.info("Using cosine similarity")
            self.item_similarity = cosine_similarity(item_cooccurrence)
        elif self.similarity_type == SIM_INCLUSION_INDEX:
            logger.info("Using inclusion index")
            self.item_similarity = inclusion_index(item_cooccurrence)
        elif self.similarity_type == SIM_JACCARD:
            logger.info("Using jaccard based similarity")
            self.item_similarity = jaccard(item_cooccurrence)
        elif self.similarity_type == SIM_LEXICOGRAPHERS_MUTUAL_INFORMATION:
            logger.info("Using lexicographers mutual information similarity")
            self.item_similarity = lexicographers_mutual_information(item_cooccurrence)
        elif self.similarity_type == SIM_LIFT:
            logger.info("Using lift based similarity")
            self.item_similarity = lift(item_cooccurrence)
        elif self.similarity_type == SIM_MUTUAL_INFORMATION:
            logger.info("Using mutual information similarity")
            self.item_similarity = mutual_information(item_cooccurrence)
        else:
            raise ValueError(
                "Unknown similarity type: {}".format(self.similarity_type)
            )

        # free up some space
        del item_cooccurrence

        logger.info("Done training")
    def score(self, test, remove_seen=False):
        """Score all items for test users.

        Args:
            test (pandas.DataFrame): users to test
            remove_seen (bool): flag to remove items seen in training from recommendation

        Returns:
            numpy.ndarray: Value of interest of all items for the users.
        """
        # get user / item indices from test set
        user_ids = list(
            map(
                lambda user: self.user2index.get(user, np.NaN),
                test[self.col_user].unique(),
            )
        )
        if any(np.isnan(user_ids)):
            raise ValueError("SAR cannot score users that are not in the training set")

        # calculate raw scores with a matrix multiplication
        logger.info("Calculating recommendation scores")
        test_scores = self.user_affinity[user_ids, :].dot(self.item_similarity)

        # ensure we're working with a dense ndarray
        if isinstance(test_scores, sparse.spmatrix):
            test_scores = test_scores.toarray()

        if self.normalize:
            counts = self.unity_user_affinity[user_ids, :].dot(self.item_similarity)
            user_min_scores = (
                np.tile(counts.min(axis=1)[:, np.newaxis], test_scores.shape[1])
                * self.rating_min
            )
            user_max_scores = (
                np.tile(counts.max(axis=1)[:, np.newaxis], test_scores.shape[1])
                * self.rating_max
            )
            test_scores = rescale(
                test_scores,
                self.rating_min,
                self.rating_max,
                user_min_scores,
                user_max_scores,
            )

        # remove items in the train set so recommended items are always novel
        if remove_seen:
            logger.info("Removing seen items")
            test_scores += self.user_affinity[user_ids, :] * -np.inf

        return test_scores
    def get_popularity_based_topk(self, top_k=10, sort_top_k=True, items=True):
        """Get top K most frequently occurring items across all users.

        Args:
            top_k (int): number of top items to recommend.
            sort_top_k (bool): flag to sort top k results.
            items (bool): if false, return most frequent users instead

        Returns:
            pandas.DataFrame: top k most popular items.
        """
        if items:
            frequencies = self.item_frequencies
            col = self.col_item
            idx = self.index2item
        else:
            if self.user_frequencies is None:
                self.user_frequencies = self.user_affinity.getnnz(axis=1).astype(
                    "int64"
                )
            frequencies = self.user_frequencies
            col = self.col_user
            idx = self.index2user

        test_scores = np.array([frequencies])

        logger.info("Getting top K")
        top_components, top_scores = get_top_k_scored_items(
            scores=test_scores, top_k=top_k, sort_top_k=sort_top_k
        )

        return pd.DataFrame(
            {
                col: [idx[item] for item in top_components.flatten()],
                self.col_prediction: top_scores.flatten(),
            }
        )
    def get_item_based_topk(self, items, top_k=10, sort_top_k=True):
        """Get top K similar items to provided seed items based on similarity metric defined.
        This method will take a set of items and use them to recommend the most similar items
        to that set based on the similarity matrix fit during training.
        This allows recommendations for cold users (unseen during training); note that the
        model is not updated.

        The following options are possible based on information provided in the items input:
        1. Single user or seed of items: only item column (ratings are assumed to be 1)
        2. Single user or seed of items w/ ratings: item column and rating column
        3. Separate users or seeds of items: item and user column (user ids are only used to separate item sets)
        4. Separate users or seeds of items with ratings: item, user and rating columns provided

        Args:
            items (pandas.DataFrame): DataFrame with item, user (optional), and rating (optional) columns
            top_k (int): number of top items to recommend
            sort_top_k (bool): flag to sort top k results

        Returns:
            pandas.DataFrame: sorted top k recommendation items
        """
        # convert item ids to indices
        item_ids = np.asarray(
            list(
                map(
                    lambda item: self.item2index.get(item, np.NaN),
                    items[self.col_item].values,
                )
            )
        )

        # if no ratings were provided assume they are all 1
        if self.col_rating in items.columns:
            ratings = items[self.col_rating]
        else:
            ratings = pd.Series(np.ones_like(item_ids))

        # create local map of user ids
        if self.col_user in items.columns:
            test_users = items[self.col_user]
            user2index = {x[1]: x[0] for x in enumerate(items[self.col_user].unique())}
            user_ids = test_users.map(user2index)
        else:
            # if no user column exists assume all entries are for a single user
            test_users = pd.Series(np.zeros_like(item_ids))
            user_ids = test_users
        n_users = user_ids.drop_duplicates().shape[0]

        # generate pseudo user affinity using seed items
        pseudo_affinity = sparse.coo_matrix(
            (ratings, (user_ids, item_ids)), shape=(n_users, self.n_items)
        ).tocsr()

        # calculate raw scores with a matrix multiplication
        test_scores = pseudo_affinity.dot(self.item_similarity)

        # remove items in the seed set so recommended items are novel
        test_scores[user_ids, item_ids] = -np.inf

        top_items, top_scores = get_top_k_scored_items(
            scores=test_scores, top_k=top_k, sort_top_k=sort_top_k
        )

        df = pd.DataFrame(
            {
                self.col_user: np.repeat(
                    test_users.drop_duplicates().values, top_items.shape[1]
                ),
                self.col_item: [self.index2item[item] for item in top_items.flatten()],
                self.col_prediction: top_scores.flatten(),
            }
        )

        # drop invalid items
        return df.replace(-np.inf, np.nan).dropna()
    def get_topk_most_similar_users(self, user, top_k, sort_top_k=True):
        """Based on user affinity towards items, calculate the most similar users to the given user.

        Args:
            user (int): user to retrieve most similar users for
            top_k (int): number of top users to return
            sort_top_k (bool): flag to sort top k results

        Returns:
            pandas.DataFrame: top k most similar users and their scores
        """
        user_idx = self.user2index[user]
        similarities = self.user_affinity[user_idx].dot(self.user_affinity.T).toarray()
        similarities[0, user_idx] = -np.inf

        top_items, top_scores = get_top_k_scored_items(
            scores=similarities, top_k=top_k, sort_top_k=sort_top_k
        )

        df = pd.DataFrame(
            {
                self.col_user: [self.index2user[user] for user in top_items.flatten()],
                self.col_prediction: top_scores.flatten(),
            }
        )

        # drop invalid users
        return df.replace(-np.inf, np.nan).dropna()
    def recommend_k_items(self, test, top_k=10, sort_top_k=True, remove_seen=False):
        """Recommend top K items for all users which are in the test set.

        Args:
            test (pandas.DataFrame): users to test
            top_k (int): number of top items to recommend
            sort_top_k (bool): flag to sort top k results
            remove_seen (bool): flag to remove items seen in training from recommendation

        Returns:
            pandas.DataFrame: top k recommendation items for each user
        """
        test_scores = self.score(test, remove_seen=remove_seen)

        top_items, top_scores = get_top_k_scored_items(
            scores=test_scores, top_k=top_k, sort_top_k=sort_top_k
        )

        df = pd.DataFrame(
            {
                self.col_user: np.repeat(
                    test[self.col_user].drop_duplicates().values, top_items.shape[1]
                ),
                self.col_item: [self.index2item[item] for item in top_items.flatten()],
                self.col_prediction: top_scores.flatten(),
            }
        )

        # drop invalid items
        return df.replace(-np.inf, np.nan).dropna()
    def predict(self, test):
        """Output SAR scores for only the user-item pairs which are in the test set.

        Args:
            test (pandas.DataFrame): DataFrame that contains users and items to test

        Returns:
            pandas.DataFrame: DataFrame that contains the prediction results
        """
        test_scores = self.score(test)
        user_ids = np.asarray(
            list(
                map(
                    lambda user: self.user2index.get(user, np.NaN),
                    test[self.col_user].values,
                )
            )
        )

        # create mapping of new items to zeros
        item_ids = np.asarray(
            list(
                map(
                    lambda item: self.item2index.get(item, np.NaN),
                    test[self.col_item].values,
                )
            )
        )
        nans = np.isnan(item_ids)
        if any(nans):
            logger.warning(
                "Items found in test not seen during training, new items will have score of 0"
            )
            test_scores = np.append(test_scores, np.zeros((self.n_users, 1)), axis=1)
            item_ids[nans] = self.n_items
            item_ids = item_ids.astype("int64")

        df = pd.DataFrame(
            {
                self.col_user: test[self.col_user].values,
                self.col_item: test[self.col_item].values,
                self.col_prediction: test_scores[user_ids, item_ids],
            }
        )
        return df
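
For context, a minimal sketch of how this class is typically driven end to end. The toy DataFrame, its column names, and the hyperparameter values below are illustrative assumptions rather than part of this module; column names are passed explicitly so the example does not depend on the library's default column constants.

    import pandas as pd
    from recommenders.models.sar.sar_singlenode import SARSingleNode

    # Tiny illustrative interaction log (hypothetical data)
    train = pd.DataFrame(
        {
            "user": [1, 1, 2, 2, 3],
            "item": ["a", "b", "a", "c", "b"],
            "rating": [5.0, 4.0, 3.0, 5.0, 4.0],
            "timestamp": [1, 2, 3, 4, 5],
        }
    )

    model = SARSingleNode(
        col_user="user",
        col_item="item",
        col_rating="rating",
        col_timestamp="timestamp",
        similarity_type="jaccard",
        timedecay_formula=True,
        time_decay_coefficient=30,
    )
    model.fit(train)

    # Top-2 unseen recommendations for the users present in `train`
    topk = model.recommend_k_items(train, top_k=2, remove_seen=True)
    print(topk)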