Source code for recommenders.models.sar.sar_singlenode

# Copyright (c) Recommenders contributors.
# Licensed under the MIT License.

import numpy as np
import pandas as pd
import logging
from scipy import sparse

from recommenders.utils.python_utils import (
    cosine_similarity,
    inclusion_index,
    jaccard,
    lexicographers_mutual_information,
    lift,
    mutual_information,
    exponential_decay,
    get_top_k_scored_items,
    rescale,
)
from recommenders.utils import constants


SIM_COOCCUR = "cooccurrence"
SIM_COSINE = "cosine"
SIM_INCLUSION_INDEX = "inclusion index"
SIM_JACCARD = "jaccard"
SIM_LEXICOGRAPHERS_MUTUAL_INFORMATION = "lexicographers mutual information"
SIM_LIFT = "lift"
SIM_MUTUAL_INFORMATION = "mutual information"

logger = logging.getLogger()


[docs]class SARSingleNode:
    """Simple Algorithm for Recommendations (SAR) implementation

    SAR is a fast scalable adaptive algorithm for personalized recommendations based on user transaction history
    and items description. The core idea behind SAR is to recommend items like those that a user already has
    demonstrated an affinity to. It does this by 1) estimating the affinity of users for items, 2) estimating
    similarity across items, and then 3) combining the estimates to generate a set of recommendations for a given user.
    """

[docs]    def __init__(
        self,
        col_user=constants.DEFAULT_USER_COL,
        col_item=constants.DEFAULT_ITEM_COL,
        col_rating=constants.DEFAULT_RATING_COL,
        col_timestamp=constants.DEFAULT_TIMESTAMP_COL,
        col_prediction=constants.DEFAULT_PREDICTION_COL,
        similarity_type=SIM_JACCARD,
        time_decay_coefficient=30,
        time_now=None,
        timedecay_formula=False,
        threshold=1,
        normalize=False,
    ):
        """Initialize model parameters

        Args:
            col_user (str): user column name
            col_item (str): item column name
            col_rating (str): rating column name
            col_timestamp (str): timestamp column name
            col_prediction (str): prediction column name
            similarity_type (str): ['cooccurrence', 'cosine', 'inclusion index', 'jaccard',
              'lexicographers mutual information', 'lift', 'mutual information'] option for
              computing item-item similarity
            time_decay_coefficient (float): number of days till ratings are decayed by 1/2
            time_now (int | None): current time for time decay calculation
            timedecay_formula (bool): flag to apply time decay
            threshold (int): item-item co-occurrences below this threshold will be removed
            normalize (bool): option for normalizing predictions to scale of original ratings
        """
        self.col_rating = col_rating
        self.col_item = col_item
        self.col_user = col_user
        self.col_timestamp = col_timestamp
        self.col_prediction = col_prediction

        available_similarity_types = [
            SIM_COOCCUR,
            SIM_COSINE,
            SIM_INCLUSION_INDEX,
            SIM_JACCARD,
            SIM_LIFT,
            SIM_MUTUAL_INFORMATION,
            SIM_LEXICOGRAPHERS_MUTUAL_INFORMATION,
        ]
        if similarity_type not in available_similarity_types:
            raise ValueError(
                'Similarity type must be one of ["'
                + '" | "'.join(available_similarity_types)
                + '"]'
            )
        self.similarity_type = similarity_type
        self.time_decay_half_life = (
            time_decay_coefficient * 24 * 60 * 60
        )  # convert to seconds
        self.time_decay_flag = timedecay_formula
        self.time_now = time_now
        self.threshold = threshold
        self.user_affinity = None
        self.item_similarity = None
        self.item_frequencies = None
        self.user_frequencies = None

        # threshold - items below this number get set to zero in co-occurrence counts
        if self.threshold <= 0:
            raise ValueError("Threshold cannot be < 1")

        # set flag to capture unity-rating user-affinity matrix for scaling scores
        self.normalize = normalize
        self.col_unity_rating = "_unity_rating"
        self.unity_user_affinity = None

        # column for mapping user / item ids to internal indices
        self.col_item_id = "_indexed_items"
        self.col_user_id = "_indexed_users"

        # obtain all the users and items from both training and test data
        self.n_users = None
        self.n_items = None

        # The min and max of the rating scale, obtained from the training data.
        self.rating_min = None
        self.rating_max = None

        # mapping for item to matrix element
        self.user2index = None
        self.item2index = None

        # the opposite of the above maps - map array index to actual string ID
        self.index2item = None
        self.index2user = None

[docs]    def compute_affinity_matrix(self, df, rating_col):
        """Affinity matrix.

        The user-affinity matrix can be constructed by treating the users and items as
        indices in a sparse matrix, and the events as the data. Here, we're treating
        the ratings as the event weights.  We convert between different sparse-matrix
        formats to de-duplicate user-item pairs, otherwise they will get added up.

        Args:
            df (pandas.DataFrame): Indexed df of users and items
            rating_col (str): Name of column to use for ratings

        Returns:
            sparse.csr: Affinity matrix in Compressed Sparse Row (CSR) format.
        """

        return sparse.coo_matrix(
            (df[rating_col], (df[self.col_user_id], df[self.col_item_id])),
            shape=(self.n_users, self.n_items),
        ).tocsr()

[docs]    def compute_time_decay(self, df, decay_column):
        """Compute time decay on provided column.

        Args:
            df (pandas.DataFrame): DataFrame of users and items
            decay_column (str): column to decay

        Returns:
            pandas.DataFrame: with column decayed
        """

        # if time_now is None use the latest time
        if self.time_now is None:
            self.time_now = df[self.col_timestamp].max()

        # apply time decay to each rating
        df[decay_column] *= exponential_decay(
            value=df[self.col_timestamp],
            max_val=self.time_now,
            half_life=self.time_decay_half_life,
        )

        # group time decayed ratings by user-item and take the sum as the user-item affinity
        return df.groupby([self.col_user, self.col_item]).sum().reset_index()

[docs]    def compute_cooccurrence_matrix(self, df):
        """Co-occurrence matrix.

        The co-occurrence matrix is defined as :math:`C = U^T * U`

        where U is the user_affinity matrix with 1's as values (instead of ratings).

        Args:
            df (pandas.DataFrame): DataFrame of users and items

        Returns:
            numpy.ndarray: Co-occurrence matrix
        """
        user_item_hits = sparse.coo_matrix(
            (np.repeat(1, df.shape[0]), (df[self.col_user_id], df[self.col_item_id])),
            shape=(self.n_users, self.n_items),
        ).tocsr()

        item_cooccurrence = user_item_hits.transpose().dot(user_item_hits)
        item_cooccurrence = item_cooccurrence.multiply(
            item_cooccurrence >= self.threshold
        )

        return item_cooccurrence.astype(df[self.col_rating].dtype)

[docs]    def set_index(self, df):
        """Generate continuous indices for users and items to reduce memory usage.

        Args:
            df (pandas.DataFrame): dataframe with user and item ids
        """

        # generate a map of continuous index values to items
        self.index2item = dict(enumerate(df[self.col_item].unique()))
        self.index2user = dict(enumerate(df[self.col_user].unique()))

        # invert the mappings from above
        self.item2index = {v: k for k, v in self.index2item.items()}
        self.user2index = {v: k for k, v in self.index2user.items()}

        # set values for the total count of users and items
        self.n_users = len(self.user2index)
        self.n_items = len(self.index2item)

[docs]    def fit(self, df):
        """Main fit method for SAR.

        .. note::

        Please make sure that `df` has no duplicates.

        Args:
            df (pandas.DataFrame): User item rating dataframe (without duplicates).
        """
        select_columns = [self.col_user, self.col_item, self.col_rating]
        if self.time_decay_flag:
            select_columns += [self.col_timestamp]

        if df[select_columns].duplicated().any():
            raise ValueError("There should not be duplicates in the dataframe")

        # generate continuous indices if this hasn't been done
        if self.index2item is None:
            self.set_index(df)

        logger.info("Collecting user affinity matrix")
        if not np.issubdtype(df[self.col_rating].dtype, np.number):
            raise TypeError("Rating column data type must be numeric")

        # copy the DataFrame to avoid modification of the input
        temp_df = df[select_columns].copy()

        if self.time_decay_flag:
            logger.info("Calculating time-decayed affinities")
            temp_df = self.compute_time_decay(df=temp_df, decay_column=self.col_rating)

        logger.info("Creating index columns")
        # add mapping of user and item ids to indices
        temp_df.loc[:, self.col_item_id] = temp_df[self.col_item].apply(
            lambda item: self.item2index.get(item, np.NaN)
        )
        temp_df.loc[:, self.col_user_id] = temp_df[self.col_user].apply(
            lambda user: self.user2index.get(user, np.NaN)
        )

        if self.normalize:
            self.rating_min = temp_df[self.col_rating].min()
            self.rating_max = temp_df[self.col_rating].max()
            logger.info("Calculating normalization factors")
            temp_df[self.col_unity_rating] = 1.0
            if self.time_decay_flag:
                temp_df = self.compute_time_decay(
                    df=temp_df, decay_column=self.col_unity_rating
                )
            self.unity_user_affinity = self.compute_affinity_matrix(
                df=temp_df, rating_col=self.col_unity_rating
            )

        # affinity matrix
        logger.info("Building user affinity sparse matrix")
        self.user_affinity = self.compute_affinity_matrix(
            df=temp_df, rating_col=self.col_rating
        )

        # calculate item co-occurrence
        logger.info("Calculating item co-occurrence")
        item_cooccurrence = self.compute_cooccurrence_matrix(df=temp_df)

        # free up some space
        del temp_df

        # creates an array with the frequency of every unique item
        self.item_frequencies = item_cooccurrence.diagonal()

        logger.info("Calculating item similarity")
        if self.similarity_type == SIM_COOCCUR:
            logger.info("Using co-occurrence based similarity")
            self.item_similarity = item_cooccurrence
        elif self.similarity_type == SIM_COSINE:
            logger.info("Using cosine similarity")
            self.item_similarity = cosine_similarity(item_cooccurrence)
        elif self.similarity_type == SIM_INCLUSION_INDEX:
            logger.info("Using inclusion index")
            self.item_similarity = inclusion_index(item_cooccurrence)
        elif self.similarity_type == SIM_JACCARD:
            logger.info("Using jaccard based similarity")
            self.item_similarity = jaccard(item_cooccurrence)
        elif self.similarity_type == SIM_LEXICOGRAPHERS_MUTUAL_INFORMATION:
            logger.info("Using lexicographers mutual information similarity")
            self.item_similarity = lexicographers_mutual_information(item_cooccurrence)
        elif self.similarity_type == SIM_LIFT:
            logger.info("Using lift based similarity")
            self.item_similarity = lift(item_cooccurrence)
        elif self.similarity_type == SIM_MUTUAL_INFORMATION:
            logger.info("Using mutual information similarity")
            self.item_similarity = mutual_information(item_cooccurrence)
        else:
            raise ValueError("Unknown similarity type: {}".format(self.similarity_type))

        # free up some space
        del item_cooccurrence

        logger.info("Done training")

[docs]    def score(self, test, remove_seen=False):
        """Score all items for test users.

        Args:
            test (pandas.DataFrame): user to test
            remove_seen (bool): flag to remove items seen in training from recommendation

        Returns:
            numpy.ndarray: Value of interest of all items for the users.
        """

        # get user / item indices from test set
        user_ids = list(
            map(
                lambda user: self.user2index.get(user, np.NaN),
                test[self.col_user].unique(),
            )
        )
        if any(np.isnan(user_ids)):
            raise ValueError("SAR cannot score users that are not in the training set")

        # calculate raw scores with a matrix multiplication
        logger.info("Calculating recommendation scores")
        test_scores = self.user_affinity[user_ids, :].dot(self.item_similarity)

        # ensure we're working with a dense ndarray
        if isinstance(test_scores, sparse.spmatrix):
            test_scores = test_scores.toarray()

        if self.normalize:
            counts = self.unity_user_affinity[user_ids, :].dot(self.item_similarity)
            user_min_scores = (
                np.tile(counts.min(axis=1)[:, np.newaxis], test_scores.shape[1])
                * self.rating_min
            )
            user_max_scores = (
                np.tile(counts.max(axis=1)[:, np.newaxis], test_scores.shape[1])
                * self.rating_max
            )
            test_scores = rescale(
                test_scores,
                self.rating_min,
                self.rating_max,
                user_min_scores,
                user_max_scores,
            )

        # remove items in the train set so recommended items are always novel
        if remove_seen:
            logger.info("Removing seen items")
            test_scores += self.user_affinity[user_ids, :] * -np.inf

        return test_scores

[docs]    def get_popularity_based_topk(self, top_k=10, sort_top_k=True, items=True):
        """Get top K most frequently occurring items across all users.

        Args:
            top_k (int): number of top items to recommend.
            sort_top_k (bool): flag to sort top k results.
            items (bool): if false, return most frequent users instead

        Returns:
            pandas.DataFrame: top k most popular items.
        """
        if items:
            frequencies = self.item_frequencies
            col = self.col_item
            idx = self.index2item
        else:
            if self.user_frequencies is None:
                self.user_frequencies = self.user_affinity.getnnz(axis=1).astype(
                    "int64"
                )
            frequencies = self.user_frequencies
            col = self.col_user
            idx = self.index2user

        test_scores = np.array([frequencies])

        logger.info("Getting top K")
        top_components, top_scores = get_top_k_scored_items(
            scores=test_scores, top_k=top_k, sort_top_k=sort_top_k
        )

        return pd.DataFrame(
            {
                col: [idx[item] for item in top_components.flatten()],
                self.col_prediction: top_scores.flatten(),
            }
        )

[docs]    def get_item_based_topk(self, items, top_k=10, sort_top_k=True):
        """Get top K similar items to provided seed items based on similarity metric defined.
        This method will take a set of items and use them to recommend the most similar items to that set
        based on the similarity matrix fit during training.
        This allows recommendations for cold-users (unseen during training), note - the model is not updated.

        The following options are possible based on information provided in the items input:
        1. Single user or seed of items: only item column (ratings are assumed to be 1)
        2. Single user or seed of items w/ ratings: item column and rating column
        3. Separate users or seeds of items: item and user column (user ids are only used to separate item sets)
        4. Separate users or seeds of items with ratings: item, user and rating columns provided

        Args:
            items (pandas.DataFrame): DataFrame with item, user (optional), and rating (optional) columns
            top_k (int): number of top items to recommend
            sort_top_k (bool): flag to sort top k results

        Returns:
            pandas.DataFrame: sorted top k recommendation items
        """

        # convert item ids to indices
        item_ids = np.asarray(
            list(
                map(
                    lambda item: self.item2index.get(item, np.NaN),
                    items[self.col_item].values,
                )
            )
        )

        # if no ratings were provided assume they are all 1
        if self.col_rating in items.columns:
            ratings = items[self.col_rating]
        else:
            ratings = pd.Series(np.ones_like(item_ids))

        # create local map of user ids
        if self.col_user in items.columns:
            test_users = items[self.col_user]
            user2index = {x[1]: x[0] for x in enumerate(items[self.col_user].unique())}
            user_ids = test_users.map(user2index)
        else:
            # if no user column exists assume all entries are for a single user
            test_users = pd.Series(np.zeros_like(item_ids))
            user_ids = test_users
        n_users = user_ids.drop_duplicates().shape[0]

        # generate pseudo user affinity using seed items
        pseudo_affinity = sparse.coo_matrix(
            (ratings, (user_ids, item_ids)), shape=(n_users, self.n_items)
        ).tocsr()

        # calculate raw scores with a matrix multiplication
        test_scores = pseudo_affinity.dot(self.item_similarity)

        # remove items in the seed set so recommended items are novel
        test_scores[user_ids, item_ids] = -np.inf

        top_items, top_scores = get_top_k_scored_items(
            scores=test_scores, top_k=top_k, sort_top_k=sort_top_k
        )

        df = pd.DataFrame(
            {
                self.col_user: np.repeat(
                    test_users.drop_duplicates().values, top_items.shape[1]
                ),
                self.col_item: [self.index2item[item] for item in top_items.flatten()],
                self.col_prediction: top_scores.flatten(),
            }
        )

        # drop invalid items
        return df.replace(-np.inf, np.nan).dropna()

[docs]    def get_topk_most_similar_users(self, user, top_k, sort_top_k=True):
        """Based on user affinity towards items, calculate the most similar users to the given user.

        Args:
            user (int): user to retrieve most similar users for
            top_k (int): number of top items to recommend
            sort_top_k (bool): flag to sort top k results

        Returns:
            pandas.DataFrame: top k most similar users and their scores
        """
        user_idx = self.user2index[user]
        similarities = self.user_affinity[user_idx].dot(self.user_affinity.T).toarray()
        similarities[0, user_idx] = -np.inf

        top_items, top_scores = get_top_k_scored_items(
            scores=similarities, top_k=top_k, sort_top_k=sort_top_k
        )

        df = pd.DataFrame(
            {
                self.col_user: [self.index2user[user] for user in top_items.flatten()],
                self.col_prediction: top_scores.flatten(),
            }
        )

        # drop invalid items
        return df.replace(-np.inf, np.nan).dropna()

[docs]    def recommend_k_items(self, test, top_k=10, sort_top_k=True, remove_seen=False):
        """Recommend top K items for all users which are in the test set

        Args:
            test (pandas.DataFrame): users to test
            top_k (int): number of top items to recommend
            sort_top_k (bool): flag to sort top k results
            remove_seen (bool): flag to remove items seen in training from recommendation

        Returns:
            pandas.DataFrame: top k recommendation items for each user
        """

        test_scores = self.score(test, remove_seen=remove_seen)

        top_items, top_scores = get_top_k_scored_items(
            scores=test_scores, top_k=top_k, sort_top_k=sort_top_k
        )

        df = pd.DataFrame(
            {
                self.col_user: np.repeat(
                    test[self.col_user].drop_duplicates().values, top_items.shape[1]
                ),
                self.col_item: [self.index2item[item] for item in top_items.flatten()],
                self.col_prediction: top_scores.flatten(),
            }
        )

        # drop invalid items
        return df.replace(-np.inf, np.nan).dropna()

[docs]    def predict(self, test):
        """Output SAR scores for only the users-items pairs which are in the test set

        Args:
            test (pandas.DataFrame): DataFrame that contains users and items to test

        Returns:
            pandas.DataFrame: DataFrame contains the prediction results
        """

        test_scores = self.score(test)
        user_ids = np.asarray(
            list(
                map(
                    lambda user: self.user2index.get(user, np.NaN),
                    test[self.col_user].values,
                )
            )
        )

        # create mapping of new items to zeros
        item_ids = np.asarray(
            list(
                map(
                    lambda item: self.item2index.get(item, np.NaN),
                    test[self.col_item].values,
                )
            )
        )
        nans = np.isnan(item_ids)
        if any(nans):
            logger.warning(
                "Items found in test not seen during training, new items will have score of 0"
            )
            test_scores = np.append(test_scores, np.zeros((self.n_users, 1)), axis=1)
            item_ids[nans] = self.n_items
            item_ids = item_ids.astype("int64")

        df = pd.DataFrame(
            {
                self.col_user: test[self.col_user].values,
                self.col_item: test[self.col_item].values,
                self.col_prediction: test_scores[user_ids, item_ids],
            }
        )
        return df