Source code for recommenders.utils.python_utils

# Copyright (c) Recommenders contributors.
# Licensed under the MIT License.

import logging
import numpy as np
from scipy import sparse


logger = logging.getLogger()


def exponential_decay(value, max_val, half_life):
    """Compute decay factor for a given value based on an exponential decay.

    Values greater than `max_val` will be set to 1.

    Args:
        value (numeric): Value for which to calculate the decay factor.
        max_val (numeric): Value at which the decay factor will be 1.
        half_life (numeric): Value at which the decay factor will be 0.5.

    Returns:
        float: Decay factor.
    """
    return np.minimum(1.0, np.power(0.5, (max_val - value) / half_life))
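
A quick illustrative check (toy numbers, not part of the module): with max_val=10 and half_life=2, a value equal to max_val decays by a factor of 1, a value one half-life below max_val by 0.5, and anything above max_val is capped at 1:

    exponential_decay(value=10, max_val=10, half_life=2)  # 1.0  (0.5 ** 0)
    exponential_decay(value=8, max_val=10, half_life=2)   # 0.5  (0.5 ** 1)
    exponential_decay(value=12, max_val=10, half_life=2)  # 1.0  (capped by np.minimum)
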
def _get_row_and_column_matrix(array):
    """Helper method to get the row and column matrix from an array.

    Args:
        array (numpy.ndarray): The array from which to get the row and column matrix.

    Returns:
        (numpy.ndarray, numpy.ndarray): (row matrix, column matrix)
    """
    row_matrix = np.expand_dims(array, axis=0)
    column_matrix = np.expand_dims(array, axis=1)
    return row_matrix, column_matrix
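
The point of this helper is NumPy broadcasting: a (1, n) row and an (n, 1) column combine elementwise into an (n, n) matrix, which is how the similarity functions below pair every item's count with every other item's count. A small sketch (illustrative only):

    row, col = _get_row_and_column_matrix(np.array([4.0, 3.0]))
    row.shape  # (1, 2)
    col.shape  # (2, 1)
    row + col  # broadcasts to (2, 2): array([[8., 7.], [7., 6.]])
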
def jaccard(cooccurrence):
    """Helper method to calculate the Jaccard similarity of a matrix of
    co-occurrences. When comparing Jaccard with count co-occurrence and lift
    similarity, count favours predictability, meaning that the most popular
    items will be recommended most of the time. Lift, by contrast, favours
    discoverability/serendipity, meaning that an item that is less popular
    overall but highly favoured by a small subset of users is more likely to
    be recommended. Jaccard is a compromise between the two.

    Args:
        cooccurrence (numpy.ndarray): The symmetric matrix of co-occurrences of items.

    Returns:
        numpy.ndarray: The matrix of Jaccard similarities between any two items.
    """
    diag_rows, diag_cols = _get_row_and_column_matrix(cooccurrence.diagonal())

    with np.errstate(invalid="ignore", divide="ignore"):
        result = cooccurrence / (diag_rows + diag_cols - cooccurrence)

    return np.array(result)
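
An illustrative sketch (toy data, not from the library): item 0 occurs 4 times, item 1 occurs 3 times, and they co-occur twice, so their Jaccard similarity is 2 / (4 + 3 - 2) = 0.4:

    cooccurrence = np.array([[4.0, 2.0], [2.0, 3.0]])
    jaccard(cooccurrence)
    # array([[1. , 0.4],
    #        [0.4, 1. ]])
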
def lift(cooccurrence):
    """Helper method to calculate the Lift of a matrix of co-occurrences.
    In comparison with basic co-occurrence and Jaccard similarity, lift
    favours discoverability and serendipity, as opposed to co-occurrence
    that favours the most popular items, and Jaccard that is a compromise
    between the two.

    Args:
        cooccurrence (numpy.ndarray): The symmetric matrix of co-occurrences of items.

    Returns:
        numpy.ndarray: The matrix of Lifts between any two items.
    """
    diag_rows, diag_cols = _get_row_and_column_matrix(cooccurrence.diagonal())

    with np.errstate(invalid="ignore", divide="ignore"):
        result = cooccurrence / (diag_rows * diag_cols)

    return np.array(result)
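
With the same toy matrix (illustrative only), lift divides each co-occurrence by the product of the individual item counts, so the off-diagonal entry is 2 / (4 * 3) ≈ 0.167:

    cooccurrence = np.array([[4.0, 2.0], [2.0, 3.0]])
    lift(cooccurrence)
    # array([[0.25      , 0.16666667],
    #        [0.16666667, 0.33333333]])
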
def mutual_information(cooccurrence):
    """Helper method to calculate the Mutual Information of a matrix of
    co-occurrences. Mutual information is a measurement of the amount of
    information shared between the i-th and j-th item column vectors.

    Args:
        cooccurrence (numpy.ndarray): The symmetric matrix of co-occurrences of items.

    Returns:
        numpy.ndarray: The matrix of mutual information between any two items.
    """
    with np.errstate(invalid="ignore", divide="ignore"):
        result = np.log2(cooccurrence.shape[0] * lift(cooccurrence))

    return np.array(result)
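
Continuing the toy example (illustrative only): with a 2x2 matrix the result is log2(2 * lift), so the off-diagonal entry is log2(2 * 2/12) = log2(1/3) ≈ -1.585:

    cooccurrence = np.array([[4.0, 2.0], [2.0, 3.0]])
    mutual_information(cooccurrence)
    # array([[-1.       , -1.5849625],
    #        [-1.5849625, -0.5849625]])
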
def lexicographers_mutual_information(cooccurrence):
    """Helper method to calculate the Lexicographers Mutual Information of
    a matrix of co-occurrences. Due to the bias of mutual information for
    low-frequency items, lexicographers mutual information corrects the
    formula by multiplying it by the co-occurrence frequency.

    Args:
        cooccurrence (numpy.ndarray): The symmetric matrix of co-occurrences of items.

    Returns:
        numpy.ndarray: The matrix of lexicographers mutual information between any two items.
    """
    with np.errstate(invalid="ignore", divide="ignore"):
        result = cooccurrence * mutual_information(cooccurrence)

    return np.array(result)
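
The lexicographers variant simply weights each mutual-information entry by its co-occurrence count, so in the same toy example (illustrative only) the off-diagonal entry becomes 2 * log2(1/3) ≈ -3.17:

    cooccurrence = np.array([[4.0, 2.0], [2.0, 3.0]])
    lexicographers_mutual_information(cooccurrence)
    # array([[-4.       , -3.169925 ],
    #        [-3.169925 , -1.7548875]])
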
def cosine_similarity(cooccurrence):
    """Helper method to calculate the Cosine similarity of a matrix of
    co-occurrences. Cosine similarity can be interpreted as the cosine of
    the angle between the i-th and j-th item vectors.

    Args:
        cooccurrence (numpy.ndarray): The symmetric matrix of co-occurrences of items.

    Returns:
        numpy.ndarray: The matrix of cosine similarity between any two items.
    """
    diag_rows, diag_cols = _get_row_and_column_matrix(cooccurrence.diagonal())

    with np.errstate(invalid="ignore", divide="ignore"):
        result = cooccurrence / np.sqrt(diag_rows * diag_cols)

    return np.array(result)
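
With the same toy matrix (illustrative only), the off-diagonal entry is 2 / sqrt(4 * 3) ≈ 0.577:

    cooccurrence = np.array([[4.0, 2.0], [2.0, 3.0]])
    cosine_similarity(cooccurrence)
    # array([[1.        , 0.57735027],
    #        [0.57735027, 1.        ]])
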
def inclusion_index(cooccurrence):
    """Helper method to calculate the Inclusion Index of a matrix of
    co-occurrences. Inclusion index measures the overlap between items.

    Args:
        cooccurrence (numpy.ndarray): The symmetric matrix of co-occurrences of items.

    Returns:
        numpy.ndarray: The matrix of inclusion index between any two items.
    """
    diag_rows, diag_cols = _get_row_and_column_matrix(cooccurrence.diagonal())

    with np.errstate(invalid="ignore", divide="ignore"):
        result = cooccurrence / np.minimum(diag_rows, diag_cols)

    return np.array(result)
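
Here the denominator is the count of the rarer of the two items, so in the toy example (illustrative only) the off-diagonal entry is 2 / min(4, 3) ≈ 0.667:

    cooccurrence = np.array([[4.0, 2.0], [2.0, 3.0]])
    inclusion_index(cooccurrence)
    # array([[1.        , 0.66666667],
    #        [0.66666667, 1.        ]])
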
def get_top_k_scored_items(scores, top_k, sort_top_k=False):
    """Extract top K items from a matrix of scores for each user-item pair,
    optionally sort results per user.

    Args:
        scores (numpy.ndarray): Score matrix (users x items).
        top_k (int): Number of top items to recommend.
        sort_top_k (bool): Flag to sort top k results.

    Returns:
        numpy.ndarray, numpy.ndarray:
        - Indices into score matrix for each user's top items.
        - Scores corresponding to top items.
    """
    # ensure we're working with a dense ndarray
    if isinstance(scores, sparse.spmatrix):
        scores = scores.todense()

    if scores.shape[1] < top_k:
        logger.warning(
            "Number of items is less than top_k, limiting top_k to number of items"
        )
    k = min(top_k, scores.shape[1])

    test_user_idx = np.arange(scores.shape[0])[:, None]

    # get top K items and scores
    # this determines the un-ordered top-k item indices for each user
    top_items = np.argpartition(scores, -k, axis=1)[:, -k:]
    top_scores = scores[test_user_idx, top_items]

    if sort_top_k:
        sort_ind = np.argsort(-top_scores)
        top_items = top_items[test_user_idx, sort_ind]
        top_scores = top_scores[test_user_idx, sort_ind]

    return np.array(top_items), np.array(top_scores)
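
A minimal usage sketch (toy scores, not from the library): two users, four items, top 2 per user, sorted so each user's best-scoring item comes first:

    scores = np.array([[0.1, 0.9, 0.3, 0.4],
                       [0.8, 0.2, 0.7, 0.5]])
    items, vals = get_top_k_scored_items(scores, top_k=2, sort_top_k=True)
    # items -> array([[1, 3],
    #                 [0, 2]])
    # vals  -> array([[0.9, 0.4],
    #                 [0.8, 0.7]])
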
def binarize(a, threshold):
    """Binarize the values.

    Args:
        a (numpy.ndarray): Input array that needs to be binarized.
        threshold (float): Threshold at or below which all values are set to 0, else 1.

    Returns:
        numpy.ndarray: Binarized array.
    """
    return np.where(a > threshold, 1.0, 0.0)
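
For instance (illustrative): with a threshold of 0.5, values strictly above 0.5 map to 1.0, and everything else, including 0.5 itself, maps to 0.0:

    binarize(np.array([0.2, 0.5, 0.7]), threshold=0.5)
    # array([0., 0., 1.])
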
def rescale(data, new_min=0, new_max=1, data_min=None, data_max=None):
    """Rescale/normalize the data to be within the range `[new_min, new_max]`.

    If data_min and data_max are explicitly provided, they will be used
    as the old min/max values instead of taken from the data.

    .. note::
        This is the same as `sklearn.preprocessing.MinMaxScaler`, with the
        exception that we can override the min/max of the old scale.

    Args:
        data (numpy.ndarray): 1d scores vector or 2d score matrix (users x items).
        new_min (int|float): The minimum of the newly scaled data.
        new_max (int|float): The maximum of the newly scaled data.
        data_min (None|number): The minimum of the passed data [if omitted it will be inferred].
        data_max (None|number): The maximum of the passed data [if omitted it will be inferred].

    Returns:
        numpy.ndarray: The newly scaled/normalized data.
    """
    data_min = data.min() if data_min is None else data_min
    data_max = data.max() if data_max is None else data_max
    return (data - data_min) / (data_max - data_min) * (new_max - new_min) + new_min
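
A short sketch (toy data): mapping scores from their observed range [1, 5] onto the default [0, 1], and the same call with an explicit old scale of [0, 10]:

    data = np.array([1.0, 3.0, 5.0])
    rescale(data)                           # array([0. , 0.5, 1. ])
    rescale(data, data_min=0, data_max=10)  # array([0.1, 0.3, 0.5])
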