# Copyright (c) Recommenders contributors.
# Licensed under the MIT License.
import numpy as np
try:
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
from pyspark.sql import Window, DataFrame
from pyspark.sql.functions import col, row_number, expr
from pyspark.sql.functions import udf
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, DoubleType, StructType, StructField
from pyspark.ml.linalg import VectorUDT
except ImportError:
pass # skip this import if we are in pure python environment
from recommenders.utils.constants import (
DEFAULT_PREDICTION_COL,
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,
DEFAULT_RELEVANCE_COL,
DEFAULT_SIMILARITY_COL,
DEFAULT_ITEM_FEATURES_COL,
DEFAULT_ITEM_SIM_MEASURE,
DEFAULT_TIMESTAMP_COL,
DEFAULT_K,
DEFAULT_THRESHOLD,
)
class SparkRatingEvaluation:
    """Spark Rating Evaluator"""

    def __init__(
        self,
        rating_true,
        rating_pred,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=DEFAULT_PREDICTION_COL,
    ):
        """Initializer.

        This is the Spark version of rating metrics evaluator.
        The methods of this class calculate rating metrics such as root mean squared error,
        mean absolute error, R squared, and explained variance.

        The true and predicted ratings are inner-joined on (col_user, col_item); only
        user-item pairs present in both inputs contribute to the metrics.

        Args:
            rating_true (pyspark.sql.DataFrame): True labels.
            rating_pred (pyspark.sql.DataFrame): Predicted labels.
            col_user (str): column name for user.
            col_item (str): column name for item.
            col_rating (str): column name for rating.
            col_prediction (str): column name for prediction.

        Raises:
            TypeError: If either input is not a Spark DataFrame.
            ValueError: If either input is empty or is missing a required column.
        """
        self.rating_true = rating_true
        self.rating_pred = rating_pred
        self.col_user = col_user
        self.col_item = col_item
        self.col_rating = col_rating
        self.col_prediction = col_prediction

        # Check if inputs are Spark DataFrames.
        if not isinstance(self.rating_true, DataFrame):
            raise TypeError(
                "rating_true should be but is not a Spark DataFrame"
            )  # pragma : No Cover

        if not isinstance(self.rating_pred, DataFrame):
            raise TypeError(
                "rating_pred should be but is not a Spark DataFrame"
            )  # pragma : No Cover

        # Check if columns exist.
        true_columns = self.rating_true.columns
        pred_columns = self.rating_pred.columns

        if rating_true.count() == 0:
            raise ValueError("Empty input dataframe")
        if rating_pred.count() == 0:
            raise ValueError("Empty input dataframe")

        if self.col_user not in true_columns:
            raise ValueError("Schema of rating_true not valid. Missing User Col")
        if self.col_item not in true_columns:
            raise ValueError("Schema of rating_true not valid. Missing Item Col")
        if self.col_rating not in true_columns:
            raise ValueError("Schema of rating_true not valid. Missing Rating Col")

        if self.col_user not in pred_columns:
            raise ValueError(
                "Schema of rating_pred not valid. Missing User Col"
            )  # pragma : No Cover
        if self.col_item not in pred_columns:
            raise ValueError(
                "Schema of rating_pred not valid. Missing Item Col"
            )  # pragma : No Cover
        if self.col_prediction not in pred_columns:
            raise ValueError("Schema of rating_pred not valid. Missing Prediction Col")

        # Normalise both inputs to (user, item, <double value>) with fixed value names.
        self.rating_true = self.rating_true.select(
            col(self.col_user),
            col(self.col_item),
            col(self.col_rating).cast("double").alias("label"),
        )
        self.rating_pred = self.rating_pred.select(
            col(self.col_user),
            col(self.col_item),
            col(self.col_prediction).cast("double").alias("prediction"),
        )

        # Keep only the (label, prediction) pairs for user-item keys found in both inputs.
        self.y_pred_true = (
            self.rating_true.join(
                self.rating_pred, [self.col_user, self.col_item], "inner"
            )
            .drop(self.col_user)
            .drop(self.col_item)
        )

        self.metrics = RegressionMetrics(
            self.y_pred_true.rdd.map(lambda x: (x.prediction, x.label))
        )

    def rmse(self):
        """Calculate Root Mean Squared Error.

        Returns:
            float: Root mean squared error.
        """
        return self.metrics.rootMeanSquaredError

    def mae(self):
        """Calculate Mean Absolute Error.

        Returns:
            float: Mean Absolute Error.
        """
        return self.metrics.meanAbsoluteError

    def rsquared(self):
        """Calculate R squared.

        Returns:
            float: R squared.
        """
        return self.metrics.r2

    def exp_var(self):
        """Calculate explained variance.

        Computed as ``1 - variance(label - prediction) / variance(label)`` over the
        joined (label, prediction) pairs.

        .. note::
           Spark MLLib's implementation is buggy (can lead to values > 1), hence we use var().

        Returns:
            float: Explained variance (max=1; negative when the residual variance exceeds
            the label variance). NaN is returned when Spark reports either variance as
            NULL, e.g. when the join of true and predicted ratings produced no rows.
        """
        # Both aggregates are evaluated in a single Spark job.
        variances = self.y_pred_true.selectExpr(
            "variance(label - prediction)", "variance(label)"
        ).collect()[0]
        var_residual, var_label = variances[0], variances[1]

        # np.divide cannot operate on None, so an undefined variance maps to NaN
        # instead of surfacing as a TypeError.
        if var_residual is None or var_label is None:
            return np.nan

        # numpy divide is more tolerant to var_label being zero
        return 1 - np.divide(var_residual, var_label)
class SparkRankingEvaluation:
    """Spark Ranking Evaluator"""

    def __init__(
        self,
        rating_true,
        rating_pred,
        k=DEFAULT_K,
        relevancy_method="top_k",
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=DEFAULT_PREDICTION_COL,
        threshold=DEFAULT_THRESHOLD,
    ):
        """Initialization.

        This is the Spark version of ranking metrics evaluator.
        The methods of this class calculate ranking metrics such as precision@k, recall@k, ndcg@k,
        and mean average precision.

        The implementations of precision@k, ndcg@k, and mean average precision are referenced from Spark MLlib, which
        can be found at `here <https://spark.apache.org/docs/2.3.0/mllib-evaluation-metrics.html#ranking-systems>`_.

        Args:
            rating_true (pyspark.sql.DataFrame): DataFrame of true rating data (in the
                format of customerID-itemID-rating tuple).
            rating_pred (pyspark.sql.DataFrame): DataFrame of predicted rating data (in
                the format of customerID-itemID-rating tuple).
            k (int): number of items to recommend to each user.
            relevancy_method (str): method for determining relevant items. Possible
                values are "top_k", "by_time_stamp", and "by_threshold".
            col_user (str): column name for user.
            col_item (str): column name for item.
            col_rating (str): column name for rating.
            col_prediction (str): column name for prediction.
            threshold (float): threshold for determining the relevant recommended items.
                This is used for the case that predicted ratings follow a known
                distribution. NOTE: this option is only activated if relevancy_method is
                set to "by_threshold".

        Raises:
            TypeError: If either input is not a Spark DataFrame.
            ValueError: If a required column is missing or relevancy_method is unknown.
        """
        self.rating_true = rating_true
        self.rating_pred = rating_pred
        self.col_user = col_user
        self.col_item = col_item
        self.col_rating = col_rating
        self.col_prediction = col_prediction
        self.threshold = threshold

        # Check if inputs are Spark DataFrames.
        if not isinstance(self.rating_true, DataFrame):
            raise TypeError(
                "rating_true should be but is not a Spark DataFrame"
            )  # pragma : No Cover

        if not isinstance(self.rating_pred, DataFrame):
            raise TypeError(
                "rating_pred should be but is not a Spark DataFrame"
            )  # pragma : No Cover

        # Check if columns exist.
        true_columns = self.rating_true.columns
        pred_columns = self.rating_pred.columns

        if self.col_user not in true_columns:
            raise ValueError(
                "Schema of rating_true not valid. Missing User Col: "
                + str(true_columns)
            )
        if self.col_item not in true_columns:
            raise ValueError("Schema of rating_true not valid. Missing Item Col")
        if self.col_rating not in true_columns:
            raise ValueError("Schema of rating_true not valid. Missing Rating Col")

        if self.col_user not in pred_columns:
            raise ValueError(
                "Schema of rating_pred not valid. Missing User Col"
            )  # pragma : No Cover
        if self.col_item not in pred_columns:
            raise ValueError(
                "Schema of rating_pred not valid. Missing Item Col"
            )  # pragma : No Cover
        if self.col_prediction not in pred_columns:
            raise ValueError("Schema of rating_pred not valid. Missing Prediction Col")

        self.k = k

        relevant_func = {
            "top_k": _get_top_k_items,
            "by_time_stamp": _get_relevant_items_by_timestamp,
            "by_threshold": _get_relevant_items_by_threshold,
        }

        if relevancy_method not in relevant_func:
            raise ValueError(
                "relevancy_method should be one of {}".format(
                    list(relevant_func.keys())
                )
            )

        # All helpers share these arguments; the prediction score plays the role of
        # the "rating" that the helper ranks or filters on.
        relevance_kwargs = {
            "dataframe": self.rating_pred,
            "col_user": self.col_user,
            "col_item": self.col_item,
            "col_rating": self.col_prediction,
        }
        # The threshold helper takes `threshold`; the other two take `k`.
        if relevancy_method == "by_threshold":
            relevance_kwargs["threshold"] = self.threshold
        else:
            relevance_kwargs["k"] = self.k

        # From here on rating_pred holds one row per user with a list of recommended items.
        self.rating_pred = relevant_func[relevancy_method](**relevance_kwargs)

        self._metrics = self._calculate_metrics()

    def _calculate_metrics(self):
        """Calculate ranking metrics.

        Builds, per user, the (recommended items, ground-truth items) pair and wraps
        the resulting RDD in a Spark MLlib ``RankingMetrics`` object.
        """
        self._items_for_user_pred = self.rating_pred

        # Column API instead of a string-built SQL expression, so the item column
        # name does not have to be a bare SQL identifier.
        self._items_for_user_true = self.rating_true.groupBy(self.col_user).agg(
            F.collect_list(self.col_item).alias("ground_truth")
        )

        # Inner join: only users present in both predictions and ground truth are scored.
        # After dropping the user key the columns are (recommended list, ground truth).
        self._items_for_user_all = self._items_for_user_pred.join(
            self._items_for_user_true, on=self.col_user
        ).drop(self.col_user)

        return RankingMetrics(self._items_for_user_all.rdd)

    def precision_at_k(self):
        """Get precision@k.

        .. note::
            More details can be found
            `here <http://spark.apache.org/docs/2.1.1/api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics.precisionAt>`_.

        Return:
            float: precision at k (min=0, max=1)
        """
        return self._metrics.precisionAt(self.k)

    def recall_at_k(self):
        """Get recall@K.

        For each user, recall is the number of distinct items shared by the
        recommendation list and the ground truth, divided by the number of
        ground-truth items; the result is the mean over users. The recommendation
        list is used as built in the initializer (it is not truncated again here).

        Return:
            float: recall at k (min=0, max=1).
        """
        # The relevancy helpers are called without col_prediction, so the list column
        # carries their default name, DEFAULT_PREDICTION_COL.
        df_hit = (
            self._items_for_user_all.withColumn(
                "hit", F.array_intersect(DEFAULT_PREDICTION_COL, "ground_truth")
            )
            .withColumn("num_hit", F.size("hit"))
            .withColumn("num_actual", F.size("ground_truth"))
        )
        df_hit = df_hit.withColumn("per_hit", df_hit["num_hit"] / df_hit["num_actual"])

        return df_hit.select(F.mean("per_hit")).collect()[0][0]

    def ndcg_at_k(self):
        """Get Normalized Discounted Cumulative Gain (NDCG)

        .. note::
            More details can be found
            `here <http://spark.apache.org/docs/2.1.1/api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics.ndcgAt>`_.

        Return:
            float: nDCG at k (min=0, max=1).
        """
        return self._metrics.ndcgAt(self.k)

    def map_at_k(self):
        """Get mean average precision at k.

        This returns Spark's ``RankingMetrics.meanAveragePrecision`` evaluated on the
        recommendation list built in the initializer; ``k`` is not passed to Spark here,
        so any cut-off comes only from how that list was constructed.

        .. note::
            More details can be found
            `here <http://spark.apache.org/docs/2.1.1/api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics.meanAveragePrecision>`_.

        Return:
            float: MAP at k (min=0, max=1).
        """
        return self._metrics.meanAveragePrecision
def _get_top_k_items(
    dataframe,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    k=DEFAULT_K,
):
    """Get the input customer-item-rating tuple in the format of Spark
    DataFrame, output a Spark DataFrame in the dense format of top k items
    for each user.

    .. note::
        if it is implicit rating, just append a column of constants to be ratings.

    Args:
        dataframe (pyspark.sql.DataFrame): DataFrame of rating data (in the format of
            customerID-itemID-rating tuple).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): name of the output column holding the item list.
        k (int): number of items for each user.

    Return:
        pyspark.sql.DataFrame: One row per user with columns col_user and col_prediction,
        where col_prediction is the list of at most k items ordered by descending rating.
    """
    # Ties in rating are broken by item id so that both the selected items and their
    # order are reproducible across runs.
    rank_window = Window.partitionBy(col_user).orderBy(
        col(col_rating).desc(), col(col_item)
    )

    # collect_list has no defined order after a plain groupBy; collecting over an
    # ordered window that spans the whole partition keeps the list in rank order,
    # which the order-sensitive ranking metrics rely on.
    list_window = (
        Window.partitionBy(col_user)
        .orderBy("rank")
        .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    )

    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(rank_window).alias("rank")
        )
        .where(col("rank") <= k)
        .withColumn(col_prediction, F.collect_list(col_item).over(list_window))
        .select(col_user, col_prediction)
        # Every row of a user carries the same full list; keep one of them.
        .dropDuplicates([col_user])
    )

    return items_for_user
def _get_relevant_items_by_threshold(
    dataframe,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    threshold=DEFAULT_THRESHOLD,
):
    """Get relevant items for each customer in the input rating data.

    Relevant items are defined as those whose rating is greater than or equal to
    a fixed threshold; the same threshold is applied to every user.

    Args:
        dataframe: Spark DataFrame of customerID-itemID-rating tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): name of the output column holding the item list.
        threshold (float): threshold for determining the relevant recommended items.
            This is used for the case that predicted ratings follow a known
            distribution.

    Return:
        pyspark.sql.DataFrame: One row per user with columns col_user and col_prediction,
        where col_prediction is the list of relevant items ordered by descending rating.
        Users without any item at or above the threshold are absent from the result.
    """
    # A global orderBy does not carry over into a window aggregate, so the ordering
    # is declared on the window itself (full-partition frame, item id as tie-breaker).
    ordered_window = (
        Window.partitionBy(col_user)
        .orderBy(F.col(col_rating).desc(), F.col(col_item))
        .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    )

    items_for_user = (
        # Column expression rather than a concatenated SQL string.
        dataframe.where(F.col(col_rating) >= threshold)
        .select(col_user, col_item, col_rating)
        .withColumn(col_prediction, F.collect_list(col_item).over(ordered_window))
        .select(col_user, col_prediction)
        # Every row of a user carries the same full list; collapse to one row.
        .dropDuplicates()
    )

    return items_for_user
def _get_relevant_items_by_timestamp(
    dataframe,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    k=DEFAULT_K,
):
    """Get relevant items for each customer defined by timestamp.

    Relevant items are defined as the k items that appear most recently
    according to timestamps.

    Args:
        dataframe (pyspark.sql.DataFrame): A Spark DataFrame of customerID-itemID-rating-timeStamp
            tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_timestamp (str): column name for timestamp.
        col_prediction (str): name of the output column holding the item list.
        k: number of relevant items to be filtered by the function.

    Return:
        pyspark.sql.DataFrame: One row per user with columns col_user and col_prediction,
        where col_prediction is the list of at most k items ordered from most to least recent.
    """
    # Most recent first; item id breaks timestamp ties so the selection is reproducible.
    recency_window = Window.partitionBy(col_user).orderBy(
        col(col_timestamp).desc(), col(col_item)
    )

    # An unordered window gives collect_list no defined order; order by the recency
    # rank over the whole partition so the list is in a stable, meaningful order.
    list_window = (
        Window.partitionBy(col_user)
        .orderBy("rank")
        .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    )

    items_for_user = (
        dataframe.select(
            col_user,
            col_item,
            col_rating,
            row_number().over(recency_window).alias("rank"),
        )
        .where(col("rank") <= k)
        .withColumn(col_prediction, F.collect_list(col_item).over(list_window))
        .select(col_user, col_prediction)
        # Every row of a user carries the same full list; collapse to one row.
        .dropDuplicates([col_user, col_prediction])
    )

    return items_for_user
class SparkDiversityEvaluation:
    """Spark Evaluator for diversity, coverage, novelty, serendipity"""

    def __init__(
        self,
        train_df,
        reco_df,
        item_feature_df=None,
        item_sim_measure=DEFAULT_ITEM_SIM_MEASURE,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_relevance=None,
    ):
        """Initializer.

        This is the Spark version of diversity metrics evaluator.
        The methods of this class calculate the following diversity metrics:

        * Coverage - it includes two metrics:
            1. catalog_coverage, which measures the proportion of items that get recommended from the item catalog;
            2. distributional_coverage, which measures how unequally different items are recommended in the
               recommendations to all users.
        * Novelty - A more novel item indicates it is less popular, i.e. it gets recommended less frequently.
        * Diversity - The dissimilarity of items being recommended.
        * Serendipity - The "unusualness" or "surprise" of recommendations to a user. When 'col_relevance' is used,
            it indicates how "pleasant surprise" of recommendations is to a user.

        The metric definitions/formulations are based on the following references with modification:

        :Citation:

            G. Shani and A. Gunawardana, Evaluating Recommendation Systems,
            Recommender Systems Handbook pp. 257-297, 2010.

            Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist: introducing
            serendipity into music recommendation, WSDM 2012

            P. Castells, S. Vargas, and J. Wang, Novelty and diversity metrics for recommender systems:
            choice, discovery and relevance, ECIR 2011

            Eugene Yan, Serendipity: Accuracy’s unpopular best friend in Recommender Systems,
            eugeneyan.com, April 2020

        Args:
            train_df (pyspark.sql.DataFrame): Data set with historical data for users and items they
                have interacted with; contains col_user, col_item. Assumed to not contain any duplicate rows.
                Interaction here follows the *item choice model* from Castells et al.
            reco_df (pyspark.sql.DataFrame): Recommender's prediction output, containing col_user, col_item,
                col_relevance (optional). Assumed to not contain any duplicate user-item pairs.
            item_feature_df (pyspark.sql.DataFrame): (Optional) It is required only when item_sim_measure='item_feature_vector'.
                It contains two columns: col_item and features (a feature vector).
            item_sim_measure (str): (Optional) This column indicates which item similarity measure to be used.
                Available measures include item_cooccurrence_count (default choice) and item_feature_vector.
            col_user (str): User id column name.
            col_item (str): Item id column name.
            col_relevance (str): Optional. This column indicates whether the recommended item is actually
                relevant to the user or not.

        Raises:
            Exception: If item_sim_measure is 'item_feature_vector' and item_feature_df is missing
                or has an unexpected schema, or if reco_df contains user-item pairs that also
                appear in train_df.
        """
        self.train_df = train_df.select(col_user, col_item)
        self.col_user = col_user
        self.col_item = col_item
        self.sim_col = DEFAULT_SIMILARITY_COL

        # Lazily computed results; each is filled in on first use and then reused.
        self.df_cosine_similarity = None
        self.df_user_item_serendipity = None
        self.df_user_serendipity = None
        self.avg_serendipity = None
        self.df_item_novelty = None
        self.avg_novelty = None
        self.df_intralist_similarity = None
        self.df_user_diversity = None
        self.avg_diversity = None

        self.item_feature_df = item_feature_df
        self.item_sim_measure = item_sim_measure

        if col_relevance is None:
            self.col_relevance = DEFAULT_RELEVANCE_COL
            # relevance term, default is 1 (relevant) for all
            self.reco_df = reco_df.select(
                col_user, col_item, F.lit(1.0).alias(self.col_relevance)
            )
        else:
            self.col_relevance = col_relevance
            self.reco_df = reco_df.select(
                col_user, col_item, F.col(self.col_relevance).cast(DoubleType())
            )

        if self.item_sim_measure == "item_feature_vector":
            self.col_item_features = DEFAULT_ITEM_FEATURES_COL
            required_schema = StructType(
                (
                    StructField(self.col_item, IntegerType()),
                    StructField(self.col_item_features, VectorUDT()),
                )
            )
            if self.item_feature_df is not None:
                # Schemas are compared through their string representation.
                if str(required_schema) != str(item_feature_df.schema):
                    raise Exception(
                        "Incorrect schema! item_feature_df should have schema:"
                        + str(required_schema)
                    )
            else:
                raise Exception(
                    "item_feature_df not specified! item_feature_df must be provided "
                    "if choosing to use item_feature_vector to calculate item similarity. "
                    "item_feature_df should have schema:" + str(required_schema)
                )

        # check if reco_df contains any user_item pairs that are already shown in train_df
        count_intersection = (
            self.train_df.select(self.col_user, self.col_item)
            .intersect(self.reco_df.select(self.col_user, self.col_item))
            .count()
        )

        if count_intersection != 0:
            raise Exception(
                "reco_df should not contain any user_item pairs that are already shown in train_df"
            )

    def _get_pairwise_items(self, df):
        """Get pairwise combinations of items per user (ignoring duplicate pairs [1,2] == [2,1])

        The result has columns col_user, i1, i2 with i1 <= i2, so each item is also
        paired with itself.
        """
        left = df.select(self.col_user, F.col(self.col_item).alias("i1"))
        right = df.select(
            F.col(self.col_user).alias("_user"),
            F.col(self.col_item).alias("i2"),
        )
        same_user_ordered_pair = (F.col(self.col_user) == F.col("_user")) & (
            F.col("i1") <= F.col("i2")
        )
        return left.join(right, same_user_ordered_pair).select(
            self.col_user, "i1", "i2"
        )

    def _get_cosine_similarity(self, n_partitions=200):
        """Return the item-item similarity DataFrame for the configured measure.

        Args:
            n_partitions (int): number of partitions of the resulting DataFrame.

        Returns:
            pyspark.sql.DataFrame: columns i1, i2 and the similarity column.

        Raises:
            Exception: If item_sim_measure is not a recognized option.
        """
        if self.item_sim_measure == "item_cooccurrence_count":
            # calculate item-item similarity based on item co-occurrence count
            self._get_cooccurrence_similarity(n_partitions)
        elif self.item_sim_measure == "item_feature_vector":
            # calculate item-item similarity based on item feature vectors
            self._get_item_feature_similarity(n_partitions)
        else:
            raise Exception(
                "item_sim_measure not recognized! The available options include 'item_cooccurrence_count' and 'item_feature_vector'."
            )
        return self.df_cosine_similarity

    def _get_cooccurrence_similarity(self, n_partitions):
        """Cosine similarity metric from

        :Citation:

            Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist:
            introducing serendipity into music recommendation, WSDM 2012

        The item indexes in the result are such that i1 <= i2.

        The similarity of (i1, i2) is the number of users who interacted with both,
        divided by sqrt(count(i1)) * sqrt(count(i2)), with counts taken from train_df.
        """
        if self.df_cosine_similarity is None:
            pairs = self._get_pairwise_items(df=self.train_df)
            item_count = self.train_df.groupBy(self.col_item).count()

            i1_sqrt_count = item_count.select(
                F.col(self.col_item).alias("i1"),
                F.pow(F.col("count"), 0.5).alias("i1_sqrt_count"),
            )
            i2_sqrt_count = item_count.select(
                F.col(self.col_item).alias("i2"),
                F.pow(F.col("count"), 0.5).alias("i2_sqrt_count"),
            )

            self.df_cosine_similarity = (
                pairs.groupBy("i1", "i2")
                .count()
                .join(i1_sqrt_count, on="i1")
                .join(i2_sqrt_count, on="i2")
                .select(
                    "i1",
                    "i2",
                    (
                        F.col("count")
                        / (F.col("i1_sqrt_count") * F.col("i2_sqrt_count"))
                    ).alias(self.sim_col),
                )
                .repartition(n_partitions, "i1", "i2")
            )
        return self.df_cosine_similarity

    @staticmethod
    @udf(returnType=DoubleType())
    def sim_cos(v1, v2):
        # Cosine similarity of two vectors, using their dot product and L2 norms.
        p = 2
        denominator = float(v1.norm(p) * v2.norm(p))
        # A zero-norm vector has no direction; report zero similarity instead of
        # failing the whole Spark job with a ZeroDivisionError.
        if denominator == 0.0:
            return 0.0
        return float(v1.dot(v2)) / denominator

    def _get_item_feature_similarity(self, n_partitions):
        """Cosine similarity metric based on item feature vectors

        The item indexes in the result are such that i1 <= i2.
        """
        if self.df_cosine_similarity is None:
            left = self.item_feature_df.select(
                F.col(self.col_item).alias("i1"),
                F.col(self.col_item_features).alias("f1"),
            )
            right = self.item_feature_df.select(
                F.col(self.col_item).alias("i2"),
                F.col(self.col_item_features).alias("f2"),
            )
            self.df_cosine_similarity = (
                left.join(right, (F.col("i1") <= F.col("i2")))
                # Name the column via self.sim_col, which is what the metric
                # computations read.
                .select("i1", "i2", self.sim_cos("f1", "f2").alias(self.sim_col))
                # No sort here: the hash repartition below would discard any ordering.
                .repartition(n_partitions, "i1", "i2")
            )
        return self.df_cosine_similarity

    # Diversity metrics
    def _get_intralist_similarity(self, df):
        """Intra-list similarity from

        :Citation:

            "Improving Recommendation Lists Through Topic Diversification",
            Ziegler, McNee, Konstan and Lausen, 2005.

        The result is computed once and cached; later calls return the cached
        DataFrame regardless of the df argument.
        """
        if self.df_intralist_similarity is None:
            pairs = self._get_pairwise_items(df=df)
            similarity_df = self._get_cosine_similarity()
            # Fillna(0) is needed in the cases where similarity_df does not have an entry for a pair of items.
            # e.g. i1 and i2 have never occurred together.
            # Only the similarity column is filled so that id columns are left untouched.
            self.df_intralist_similarity = (
                pairs.join(similarity_df, on=["i1", "i2"], how="left")
                .fillna(0, subset=[self.sim_col])
                # Self-pairs do not count towards intra-list similarity.
                .filter(F.col("i1") != F.col("i2"))
                .groupBy(self.col_user)
                .agg(F.mean(self.sim_col).alias("avg_il_sim"))
                .select(self.col_user, "avg_il_sim")
            )
        return self.df_intralist_similarity

    def user_diversity(self):
        """Calculate average diversity of recommendations for each user.
        The metric definition is based on formula (3) in the following reference:

        :Citation:

            Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist:
            introducing serendipity into music recommendation, WSDM 2012

        Returns:
            pyspark.sql.dataframe.DataFrame: A dataframe with the following columns: col_user, user_diversity.
        """
        if self.df_user_diversity is None:
            self.df_intralist_similarity = self._get_intralist_similarity(self.reco_df)
            self.df_user_diversity = (
                self.df_intralist_similarity.withColumn(
                    "user_diversity", 1 - F.col("avg_il_sim")
                )
                .select(self.col_user, "user_diversity")
                .orderBy(self.col_user)
            )
        return self.df_user_diversity

    def diversity(self):
        """Calculate average diversity of recommendations across all users.

        Returns:
            float: diversity.
        """
        if self.avg_diversity is None:
            self.df_user_diversity = self.user_diversity()
            self.avg_diversity = self.df_user_diversity.agg(
                {"user_diversity": "mean"}
            ).first()[0]
        return self.avg_diversity

    # Novelty metrics
    def historical_item_novelty(self):
        """Calculate novelty for each item. Novelty is computed as the minus logarithm of
        (number of interactions with item / total number of interactions). The definition of the metric
        is based on the following reference using the choice model (eqs. 1 and 6):

        :Citation:

            P. Castells, S. Vargas, and J. Wang, Novelty and diversity metrics for recommender systems:
            choice, discovery and relevance, ECIR 2011

        The novelty of an item can be defined relative to a set of observed events on the set of all items.
        These can be events of user choice (item "is picked" by a random user) or user discovery
        (item "is known" to a random user). The above definition of novelty reflects a factor of item popularity.
        High novelty values correspond to long-tail items in the density function, that few users have interacted
        with and low novelty values correspond to popular head items.

        Returns:
            pyspark.sql.dataframe.DataFrame: A dataframe with the following columns: col_item, item_novelty.
        """
        if self.df_item_novelty is None:
            n_records = self.train_df.count()
            self.df_item_novelty = (
                self.train_df.groupBy(self.col_item)
                .count()
                .withColumn("item_novelty", -F.log2(F.col("count") / n_records))
                .select(self.col_item, "item_novelty")
                .orderBy(self.col_item)
            )
        return self.df_item_novelty

    def novelty(self):
        """Calculate the average novelty in a list of recommended items (this assumes that the recommendation list
        is already computed). Follows section 5 from

        :Citation:

            P. Castells, S. Vargas, and J. Wang, Novelty and diversity metrics for recommender systems:
            choice, discovery and relevance, ECIR 2011

        Returns:
            float: novelty, i.e. the item novelty averaged over all recommendations
            (each item weighted by how many times it is recommended).
        """
        if self.avg_novelty is None:
            self.df_item_novelty = self.historical_item_novelty()
            n_recommendations = self.reco_df.count()
            weighted_novelty_sum = (
                self.reco_df.groupBy(self.col_item)
                .count()
                .join(self.df_item_novelty, self.col_item)
                .selectExpr("sum(count * item_novelty)")
                .first()[0]
            )
            self.avg_novelty = weighted_novelty_sum / n_recommendations
        return self.avg_novelty

    # Serendipity metrics
    def user_item_serendipity(self):
        """Calculate serendipity of each item in the recommendations for each user.
        The metric definition is based on the following references:

        :Citation:

            Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist:
            introducing serendipity into music recommendation, WSDM 2012

            Eugene Yan, Serendipity: Accuracy’s unpopular best friend in Recommender Systems,
            eugeneyan.com, April 2020

        Returns:
            pyspark.sql.dataframe.DataFrame: A dataframe with columns: col_user, col_item, user_item_serendipity.
        """
        # for every col_user, col_item in reco_df, join all interacted items from train_df.
        # These interacted items are repeated for each item in reco_df for a specific user.
        if self.df_user_item_serendipity is None:
            self.df_cosine_similarity = self._get_cosine_similarity()
            self.df_user_item_serendipity = (
                self.reco_df.select(
                    self.col_user,
                    self.col_item,
                    F.col(self.col_item).alias(
                        "reco_item_tmp"
                    ),  # duplicate col_item to keep
                )
                .join(
                    self.train_df.select(
                        self.col_user, F.col(self.col_item).alias("train_item_tmp")
                    ),
                    on=[self.col_user],
                )
                # The similarity table stores each pair once with i1 <= i2, so the
                # (recommended, interacted) pair is put into that order before joining.
                .select(
                    self.col_user,
                    self.col_item,
                    F.least(F.col("reco_item_tmp"), F.col("train_item_tmp")).alias(
                        "i1"
                    ),
                    F.greatest(F.col("reco_item_tmp"), F.col("train_item_tmp")).alias(
                        "i2"
                    ),
                )
                .join(self.df_cosine_similarity, on=["i1", "i2"], how="left")
                # Pairs without a similarity entry count as similarity 0; only the
                # similarity column is filled so that id columns are left untouched.
                .fillna(0, subset=[self.sim_col])
                .groupBy(self.col_user, self.col_item)
                .agg(F.mean(self.sim_col).alias("avg_item2interactedHistory_sim"))
                # Bring the relevance column back in.
                .join(self.reco_df, on=[self.col_user, self.col_item])
                .withColumn(
                    "user_item_serendipity",
                    (1 - F.col("avg_item2interactedHistory_sim"))
                    * F.col(self.col_relevance),
                )
                .select(self.col_user, self.col_item, "user_item_serendipity")
                .orderBy(self.col_user, self.col_item)
            )
        return self.df_user_item_serendipity

    def user_serendipity(self):
        """Calculate average serendipity for each user's recommendations.

        Returns:
            pyspark.sql.dataframe.DataFrame: A dataframe with following columns: col_user, user_serendipity.
        """
        if self.df_user_serendipity is None:
            self.df_user_item_serendipity = self.user_item_serendipity()
            self.df_user_serendipity = (
                self.df_user_item_serendipity.groupBy(self.col_user)
                .agg(F.mean("user_item_serendipity").alias("user_serendipity"))
                .orderBy(self.col_user)
            )
        return self.df_user_serendipity

    def serendipity(self):
        """Calculate average serendipity for recommendations across all users.

        Returns:
            float: serendipity.
        """
        if self.avg_serendipity is None:
            self.df_user_serendipity = self.user_serendipity()
            self.avg_serendipity = self.df_user_serendipity.agg(
                {"user_serendipity": "mean"}
            ).first()[0]
        return self.avg_serendipity

    # Coverage metrics
    def catalog_coverage(self):
        """Calculate catalog coverage for recommendations across all users.
        The metric definition is based on the "catalog coverage" definition in the following reference:

        :Citation:

            G. Shani and A. Gunawardana, Evaluating Recommendation Systems,
            Recommender Systems Handbook pp. 257-297, 2010.

        Returns:
            float: catalog coverage
        """
        # distinct item count in reco_df
        count_distinct_item_reco = self.reco_df.select(self.col_item).distinct().count()
        # distinct item count in train_df
        count_distinct_item_train = (
            self.train_df.select(self.col_item).distinct().count()
        )

        # catalog coverage
        return count_distinct_item_reco / count_distinct_item_train

    def distributional_coverage(self):
        """Calculate distributional coverage for recommendations across all users.
        The metric definition is based on formula (21) in the following reference:

        :Citation:

            G. Shani and A. Gunawardana, Evaluating Recommendation Systems,
            Recommender Systems Handbook pp. 257-297, 2010.

        Returns:
            float: distributional coverage
        """
        # In reco_df, how many times each col_item is being recommended
        df_itemcnt_reco = self.reco_df.groupBy(self.col_item).count()

        # the number of total recommendations
        count_row_reco = self.reco_df.count()

        # p(i) is the share of recommendations going to item i; the coverage is the
        # base-2 entropy of that distribution.
        df_entropy = df_itemcnt_reco.withColumn(
            "p(i)", F.col("count") / count_row_reco
        ).withColumn("entropy(i)", F.col("p(i)") * F.log2(F.col("p(i)")))

        # distributional coverage
        return -df_entropy.agg(F.sum("entropy(i)")).collect()[0][0]