# Source code for recommenders.datasets.sparse

# Copyright (c) Recommenders contributors.
# Licensed under the MIT License.

import pandas as pd
import numpy as np
import itertools

from scipy.sparse import coo_matrix
import logging

# import default parameters
from recommenders.utils.constants import (
    DEFAULT_USER_COL,
    DEFAULT_ITEM_COL,
    DEFAULT_RATING_COL,
    DEFAULT_PREDICTION_COL,
)


# Module-level logger named after this module; used by AffinityMatrix for progress messages.
log = logging.getLogger(__name__)


class AffinityMatrix:
    """Generate the user/item affinity matrix from a pandas dataframe and vice versa.

    ``gen_affinity_matrix`` builds a dense (Nusers, Nitems) matrix together with the
    dictionaries that translate original user/item ids into row/column indices.
    ``map_back_sparse`` performs the inverse operation: it turns a matrix with the same
    layout (ratings or predictions) back into a dataframe keyed by the original ids.
    """

    def __init__(
        self,
        df,
        items_list=None,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_pred=DEFAULT_PREDICTION_COL,
        save_path=None,
    ):
        """Initialize class parameters

        Args:
            df (pandas.DataFrame): a dataframe containing the data
            items_list (numpy.ndarray): a list of unique items to use (if provided);
                its order defines the column order of the affinity matrix
            col_user (str): default name for user column
            col_item (str): default name for item column
            col_rating (str): default name for rating columns
            col_pred (str): default name for prediction column (used by `map_back_sparse`)
            save_path (str): default path to save item/user maps
        """
        self.df = df  # dataframe
        self.items_list = items_list  # list of unique items

        # pandas DF parameters
        self.col_item = col_item
        self.col_user = col_user
        self.col_rating = col_rating
        self.col_pred = col_pred

        # Options to save the model for future use
        self.save_path = save_path

    def _gen_index(self):
        """
        Generate the user/item index:
        map_users, map_items: dictionaries mapping the original user/item index to matrix indices
        map_back_users, map_back_items: dictionaries to map back the matrix elements to the original
        dataframe indices

        Basic mechanics:
        As a first step we retrieve the unique elements in the dataset. In this way we can take care
        of either completely missing rows (a user with no ratings) or completely missing columns
        (an item that has not been reviewed by anyone). The original indices in the dataframe are
        then mapped to an ordered, contiguous integer series to generate a compact matrix representation.
        Functions to map back to the original indices are also provided and can be saved in order to use
        a pretrained model.

        Side effects:
            Sets `df_` (a user-sorted copy of `df` with the extra columns "hashedItems" and
            "hashedUsers"), `Nusers`, `Nitems`, `map_users`, `map_items`, `map_back_users` and
            `map_back_items`. If `save_path` is set, the four dictionaries are written there.
        """
        # sort entries by user index; sort_values returns a new dataframe, so the columns
        # added below are written to self.df_ and not to the caller's self.df
        self.df_ = self.df.sort_values(by=[self.col_user])

        # find unique user and item index
        unique_users = self.df_[self.col_user].unique()

        if self.items_list is not None:
            # use the caller-provided list; it is expected to hold unique values, since
            # Nitems counts every entry while the dictionaries below keep one index per value
            unique_items = self.items_list
        else:
            # otherwise use unique items from DF, in order of first appearance
            unique_items = self.df_[self.col_item].unique()

        self.Nusers = len(unique_users)
        self.Nitems = len(unique_items)

        # create a dictionary to map unique users/items to hashed values to generate the matrix
        self.map_users = {x: i for i, x in enumerate(unique_users)}
        self.map_items = {x: i for i, x in enumerate(unique_items)}

        # map back functions used to get back the original dataframe
        self.map_back_users = dict(enumerate(unique_users))
        self.map_back_items = dict(enumerate(unique_items))

        # ids without an entry in the corresponding dictionary are mapped to NaN here
        self.df_.loc[:, "hashedItems"] = self.df_[self.col_item].map(self.map_items)
        self.df_.loc[:, "hashedUsers"] = self.df_[self.col_user].map(self.map_users)

        # optionally save the inverse dictionary to work with trained models
        if self.save_path is not None:
            # np.save appends ".npy" and stores each dict as a pickled object array, so
            # reading it back requires np.load(..., allow_pickle=True).item()
            np.save(self.save_path + "/user_dict", self.map_users)
            np.save(self.save_path + "/item_dict", self.map_items)

            np.save(self.save_path + "/user_back_dict", self.map_back_users)
            np.save(self.save_path + "/item_back_dict", self.map_back_items)

    def gen_affinity_matrix(self):
        """Generate the user/item affinity matrix.

        As a first step, two new columns are added to a sorted copy of the input DF, containing
        the index maps generated by the _gen_index() method. The new indices, together with the
        ratings, are then used to generate the user/item affinity matrix using scipy's sparse
        matrix method coo_matrix; for reference see:
        https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html.
        The input format is: `coo_matrix((data, (rows, columns)), shape=(rows, columns))`

        The sparse matrix is converted to a dense array before being returned; in that
        conversion, ratings that share the same (user, item) pair are summed together.

        Returns:
            numpy.ndarray, dict, dict:
            - User-affinity matrix of dimensions (Nusers, Nitems) in numpy format.
              Unrated movies are assigned a value of 0.
            - Dictionary mapping the original user ids to matrix row indices.
            - Dictionary mapping the original item ids to matrix column indices.

        Raises:
            ValueError: if a user or item in the dataframe has no matrix index, e.g. when
                `items_list` does not contain every item present in the dataframe.
        """

        log.info("Generating the user/item affinity matrix...")

        self._gen_index()

        ratings = self.df_[self.col_rating]  # ratings
        itm_id = self.df_["hashedItems"]  # itm_id serving as columns
        usr_id = self.df_["hashedUsers"]  # usr_id serving as rows

        # A NaN index cannot be placed in the matrix; fail with an actionable message
        # instead of letting the NaN reach coo_matrix as a bogus integer index.
        if usr_id.isna().any() or itm_id.isna().any():
            raise ValueError(
                "Some users/items in the dataframe have no matrix index; "
                "if items_list is provided it must contain every item in the dataframe"
            )

        # generate a sparse matrix representation using scipy's coo_matrix and convert to array format
        self.AM = coo_matrix(
            (ratings, (usr_id, itm_id)), shape=(self.Nusers, self.Nitems)
        ).toarray()

        total = self.AM.size  # number of elements in the matrix
        zero = total - np.count_nonzero(self.AM)  # number of unrated items
        # Percentage of zeros in the matrix; guard against an empty matrix (total == 0)
        sparsness = zero / total * 100 if total else 0.0

        log.info("Matrix generated, sparseness percentage: %d", sparsness)

        return self.AM, self.map_users, self.map_items

    def map_back_sparse(self, X, kind):
        """Map back the user/affinity matrix to a pd dataframe

        Only the non-zero entries of `X` produce a row in the output. Rows are emitted in
        row-major order, i.e. grouped by user index and, within a user, by increasing item index.
        The index maps created by `gen_affinity_matrix` must already exist on this instance.

        Args:
            X (numpy.ndarray, int32): user/item affinity matrix (two-dimensional)
            kind (string): specify if the output values are ratings or predictions; the value
                "ratings" selects the rating column name, any other value selects the
                prediction column name
        Returns:
            pandas.DataFrame: the generated pandas dataframe with the user column, the item
            column and either the rating or the prediction column
        """
        # 1) Create a DF from a sparse matrix
        # np.nonzero scans the matrix in row-major order, so a single vectorized call yields
        # the (user index, item index) pairs of all non-zero entries, user by user
        user_idx, item_idx = np.nonzero(X)
        values = X[user_idx, item_idx]  # obtain the non-zero ratings

        col_out = self.col_rating if kind == "ratings" else self.col_pred

        # create a df
        out_df = pd.DataFrame(
            {self.col_user: user_idx, self.col_item: item_idx, col_out: values}
        )

        # 2) map back user/item ids to their original value
        out_df[self.col_user] = out_df[self.col_user].map(self.map_back_users)
        out_df[self.col_item] = out_df[self.col_item].map(self.map_back_items)

        return out_df