Source code for recommenders.models.geoimc.geoimc_data

# Copyright (c) Recommenders contributors.
# Licensed under the MIT License.

import logging
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix, isspmatrix_csr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

from recommenders.utils.python_utils import binarize
from .geoimc_utils import length_normalize, reduce_dims


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("geoimc")


class DataPtr:
    """Holds data and its respective indices"""

    def __init__(self, data, entities):
        """Initialize a data pointer

        Args:
            data (csr_matrix): The target data matrix.
            entities (Iterator): An iterator (of 2 elements (ndarray)) containing
                the features of row, col entities.
        """
        assert isspmatrix_csr(data)

        self.data = data
        self.entities = entities
        self.data_indices = None
        self.entity_indices = [None, None]

    def get_data(self):
        """
        Returns:
            csr_matrix: Target matrix (based on the data_indices filter)
        """
        if self.data_indices is None:
            return self.data
        return self.data[self.data_indices]
[docs] def get_entity(self, of="row"): """Get entity Args: of (str): The entity, either 'row' or 'col' Returns: numpy.ndarray: Entity matrix (based on the entity_indices filter) """ idx = 0 if of == "row" else 1 if self.entity_indices[idx] is None: return self.entities[idx] return self.entities[idx][self.entity_indices[idx]]


class Dataset:
    """Base class that holds the (minimal) information needed"""

    def __init__(self, name, features_dim=0, normalize=False, target_transform=""):
        """Initialize parameters

        Args:
            name (str): Name of the dataset
            features_dim (uint): Dimension of the features. If not 0, PCA is
                performed on the features as the dimensionality reduction
                technique.
            normalize (bool): Normalize the features.
            target_transform (str): Transform the target values. Current options
                are 'normalize' (Normalize the values), '' (Do nothing) and
                'binarize' (convert the values using a threshold defined per
                dataset).
        """
        self.name = name
        self.training_data = None
        self.test_data = None
        self.entities = None
        self.features_dim = features_dim
        self.feat_normalize = normalize
        self.target_transform = target_transform

    def normalize(self):
        """Normalizes the entity features"""
        if self.feat_normalize:
            for i in range(len(self.entities)):
                if isspmatrix_csr(self.entities[i]):
                    logger.info("Normalizing CSR matrix")
                    self.entities[i] = normalize(self.entities[i])
                else:
                    self.entities[i] = length_normalize(self.entities[i])

    def generate_train_test_data(self, data, test_ratio=0.3):
        """Generate the train/test split. The split is performed on the row
        entities, so this is essentially a cold-start test for the row entities.

        Args:
            data (csr_matrix): The entire target matrix.
            test_ratio (float): Fraction of rows assigned to the test split.
        """
        self.training_data = DataPtr(data, self.entities)
        self.test_data = DataPtr(data, self.entities)
        self.training_data.data_indices, self.test_data.data_indices = train_test_split(
            np.arange(data.shape[0]),
            test_size=test_ratio,
            shuffle=True,
            random_state=0,
        )
        self.training_data.entity_indices[0] = self.training_data.data_indices
        self.test_data.entity_indices[0] = self.test_data.data_indices

    def reduce_dims(self):
        """Reduces the dimensionality of entity features."""
        if self.features_dim != 0:
            self.entities[0] = reduce_dims(self.entities[0], self.features_dim)
            self.entities[1] = reduce_dims(self.entities[1], self.features_dim)
            logger.info("Dimensionality reduced ...")
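

# A hedged sketch of the cold-start split performed by
# `generate_train_test_data` (illustrative only; `_demo_cold_start_split` is a
# hypothetical helper, not part of the original module). Because the split is
# over row indices, the train and test row entities are disjoint.
def _demo_cold_start_split():
    ds = Dataset("demo")
    ds.entities = [np.ones((10, 4)), np.ones((6, 4))]  # 10 rows, 6 cols
    target = coo_matrix(np.arange(60, dtype=np.float64).reshape(10, 6)).tocsr()
    ds.generate_train_test_data(target, test_ratio=0.3)  # 7 train / 3 test rows
    train_rows = set(ds.training_data.data_indices)
    test_rows = set(ds.test_data.data_indices)
    assert train_rows.isdisjoint(test_rows)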


class ML_100K(Dataset):
    """Handles MovieLens-100K"""

    def __init__(self, **kwargs):
        super().__init__(self.__class__.__name__, **kwargs)
        self.min_rating = 1
        self.max_rating = 5

    def df2coo(self, df):
        """Convert the input dataframe into a coo matrix

        Args:
            df (pandas.DataFrame): DataFrame containing the target matrix information.
        """
        row = list(df["user id"] - 1)
        col = list(df["item id"] - 1)
        data = list(df["rating"])
        if self.target_transform == "normalize":
            # Scale the ratings by the L2 norm of the rating scale.
            data = data / np.sqrt(
                np.sum(np.arange(self.min_rating, self.max_rating + 1) ** 2)
            )
        elif self.target_transform == "binarize":
            data = binarize(np.array(data), 3)
        # TODO: Get the matrix shape from `u.info`
        return coo_matrix((data, (row, col)), shape=(943, 1682))
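
    # Worked example of the 'normalize' transform above (illustrative): with
    # min_rating=1 and max_rating=5 the divisor is
    # sqrt(1**2 + 2**2 + 3**2 + 4**2 + 5**2) = sqrt(55) ~= 7.416, so a rating
    # of 5 maps to 5 / 7.416 ~= 0.674.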

    def _read_from_file(self, path):
        """Read the target matrix from the file at `path`.

        Args:
            path (str): Path to the target matrix
        """
        df = pd.read_csv(
            path,
            delimiter="\t",
            names=["user id", "item id", "rating", "timestamp"],
            encoding="ISO-8859-1",
        )
        df.drop(["timestamp"], axis=1, inplace=True)
        return self.df2coo(df)

    def load_data(self, path):
        """Load the dataset

        Args:
            path (str): Path to the directory containing the ML100K dataset
                (u.user, u.item, u1.base, u1.test).
        """
        self.entities = [
            self._load_user_features(f"{path}/u.user"),
            self._load_item_features(f"{path}/u.item"),
        ]
        self.normalize()
        self.reduce_dims()
        self.training_data = DataPtr(
            self._read_from_file(f"{path}/u1.base").tocsr(), self.entities
        )
        self.test_data = DataPtr(
            self._read_from_file(f"{path}/u1.test").tocsr(), self.entities
        )

    def _load_user_features(self, path):
        """Load user features

        Args:
            path (str): Path to the file containing user features information
        """
        data = pd.read_csv(
            path,
            delimiter="|",
            names=["user_id", "age", "gender", "occupation", "zip_code"],
        )
        features_df = pd.concat(
            [
                data["user_id"],
                pd.get_dummies(data["user_id"]),
                pd.get_dummies(data["age"]),
                pd.get_dummies(data["gender"]),
                pd.get_dummies(data["occupation"]),
                pd.get_dummies(data["zip_code"]),
            ],
            axis=1,
        )
        features_df.drop(["user_id"], axis=1, inplace=True)
        user_features = np.nan_to_num(features_df.to_numpy())
        return user_features

    def _load_item_features(self, path):
        """Load item features

        Args:
            path (str): Path to the file containing item features information
        """
        header = [
            "movie_id",
            "movie_title",
            "release_date",
            "video_release_date",
            "IMDb_URL",
            "unknown",
            "Action",
            "Adventure",
            "Animation",
            "Childrens",
            "Comedy",
            "Crime",
            "Documentary",
            "Drama",
            "Fantasy",
            "Film-Noir",
            "Horror",
            "Musical",
            "Mystery",
            "Romance",
            "Sci-Fi",
            "Thriller",
            "War",
            "Western",
        ]
        data = pd.read_csv(path, delimiter="|", names=header, encoding="ISO-8859-1")
        features_df = pd.concat(
            [
                pd.get_dummies(data["movie_title"]),
                pd.get_dummies(data["release_date"]),
                pd.get_dummies(data["video_release_date"]),
                pd.get_dummies(data["IMDb_URL"]),
                data[header[5:]],
            ],
            axis=1,
        )
        item_features = np.nan_to_num(features_df.to_numpy())
        return item_features
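

# An end-to-end sketch (illustrative only; `_demo_ml100k` is a hypothetical
# helper, and `path` must point at a local copy of the MovieLens-100K
# directory containing u.user, u.item, u1.base and u1.test).
def _demo_ml100k(path):
    ml = ML_100K(normalize=True, target_transform="binarize")
    ml.load_data(path)
    ratings = ml.training_data.get_data()  # csr_matrix of (binarized) ratings
    users = ml.training_data.get_entity("row")  # user feature matrix
    return ratings, users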