# Copyright (c) Recommenders contributors.
# Licensed under the MIT License.
import pandas as pd
import numpy as np
import seaborn as sns
from lightfm.evaluation import precision_at_k, recall_at_k


def model_perf_plots(df):
"""Function to plot model performance metrics.
Args:
df (pandas.DataFrame): Dataframe in tidy format, with ['epoch','level','value'] columns

    Returns:
        object: seaborn FacetGrid containing the plotted performance metrics
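
    Example:
        Illustrative usage sketch; assumes `fitting_metrics` is the tidy
        dataframe returned by `track_model_metrics`:

        >>> g = model_perf_plots(fitting_metrics)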
"""
    g = sns.FacetGrid(df, col="metric", hue="stage", col_wrap=2, sharey=False)
    g = g.map(sns.scatterplot, "epoch", "value").add_legend()
    return g


def compare_metric(df_list, metric="prec", stage="test"):
"""Function to combine and prepare list of dataframes into tidy format.
Args:
df_list (list): List of dataframes
metrics (str): name of metric to be extracted, optional
stage (str): name of model fitting stage to be extracted, optional

    Returns:
        pandas.DataFrame: Tidy dataframe of the selected metric, with ['epoch', 'data', 'value'] columns
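
    Example:
        Illustrative usage sketch; assumes `output0` and `output1` are tidy
        performance dataframes returned by `track_model_metrics` for two models:

        >>> recall_df = compare_metric([output0, output1], metric="rec", stage="test")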
"""
    colnames = ["model" + str(x) for x in range(1, len(df_list) + 1)]
models = [
df[(df["stage"] == stage) & (df["metric"] == metric)]["value"]
.reset_index(drop=True)
.values
for df in df_list
]
output = pd.DataFrame(zip(*models), columns=colnames).stack().reset_index()
output.columns = ["epoch", "data", "value"]
return output


def track_model_metrics(
model,
train_interactions,
test_interactions,
k=10,
no_epochs=100,
no_threads=8,
show_plot=True,
**kwargs
):
"""Function to record model's performance at each epoch, formats the performance into tidy format,
plots the performance and outputs the performance data.
Args:
model (LightFM instance): fitted LightFM model
train_interactions (scipy sparse COO matrix): train interactions set
test_interactions (scipy sparse COO matrix): test interaction set
k (int): number of recommendations, optional
no_epochs (int): Number of epochs to run, optional
        no_threads (int): Number of parallel threads to use, optional
        show_plot (bool): Whether to plot the performance traces, optional
        **kwargs: Other keyword arguments to be passed down to `fit_partial`, `precision_at_k` and `recall_at_k`

    Returns:
        pandas.DataFrame, LightFM model:
            - Performance traces of the fitted model
            - Fitted model
        The performance plot is produced as a side effect when `show_plot` is True.
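
    Example:
        Illustrative usage sketch; assumes `model` is a LightFM instance and
        `train_interactions` / `test_interactions` are scipy sparse COO matrices:

        >>> output, fitted_model = track_model_metrics(
        ...     model, train_interactions, test_interactions, no_epochs=20
        ... )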
"""
# initialising temp data storage
model_prec_train = [0] * no_epochs
model_prec_test = [0] * no_epochs
model_rec_train = [0] * no_epochs
model_rec_test = [0] * no_epochs
# fit model and store train/test metrics at each epoch
for epoch in range(no_epochs):
model.fit_partial(
interactions=train_interactions, epochs=1, num_threads=no_threads, **kwargs
)
model_prec_train[epoch] = precision_at_k(
model, train_interactions, k=k, **kwargs
).mean()
model_prec_test[epoch] = precision_at_k(
model, test_interactions, k=k, **kwargs
).mean()
model_rec_train[epoch] = recall_at_k(
model, train_interactions, k=k, **kwargs
).mean()
model_rec_test[epoch] = recall_at_k(
model, test_interactions, k=k, **kwargs
).mean()
# collect the performance metrics into a dataframe
fitting_metrics = pd.DataFrame(
zip(model_prec_train, model_prec_test, model_rec_train, model_rec_test),
columns=[
"model_prec_train",
"model_prec_test",
"model_rec_train",
"model_rec_test",
],
)
# convert into tidy format
fitting_metrics = fitting_metrics.stack().reset_index()
fitting_metrics.columns = ["epoch", "level", "value"]
    # extract the labels for each observation
fitting_metrics["stage"] = fitting_metrics.level.str.split("_").str[-1]
fitting_metrics["metric"] = fitting_metrics.level.str.split("_").str[1]
fitting_metrics.drop(["level"], axis=1, inplace=True)
# replace the metric keys to improve visualisation
metric_keys = {"prec": "Precision", "rec": "Recall"}
fitting_metrics.metric.replace(metric_keys, inplace=True)
    # plot the performance data
if show_plot:
model_perf_plots(fitting_metrics)
return fitting_metrics, model


def similar_users(user_id, user_features, model, N=10):
"""Function to return top N similar users based on https://github.com/lyst/lightfm/issues/244#issuecomment-355305681
Args:
user_id (int): id of user to be used as reference
user_features (scipy sparse CSR matrix): user feature matric
model (LightFM instance): fitted LightFM model
N (int): Number of top similar users to return
Returns:
pandas.DataFrame: top N most similar users with score
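
    Example:
        Illustrative usage sketch; assumes `model` is a fitted LightFM model and
        `user_features` is the scipy sparse CSR matrix used during fitting
        (or None if the model was fitted without user features):

        >>> top_users = similar_users(user_id=42, user_features=user_features, model=model)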
"""
_, user_representations = model.get_user_representations(features=user_features)
# Cosine similarity
scores = user_representations.dot(user_representations[user_id, :])
user_norms = np.linalg.norm(user_representations, axis=1)
user_norms[user_norms == 0] = 1e-10
scores /= user_norms
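    # take the top N+1 scores, since the reference user itself will be the best match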
best = np.argpartition(scores, -(N + 1))[-(N + 1) :]
return pd.DataFrame(
sorted(zip(best, scores[best] / user_norms[user_id]), key=lambda x: -x[1])[1:],
columns=["userID", "score"],
)


def similar_items(item_id, item_features, model, N=10):
"""Function to return top N similar items
based on https://github.com/lyst/lightfm/issues/244#issuecomment-355305681
Args:
item_id (int): id of item to be used as reference
        item_features (scipy sparse CSR matrix): item feature matrix
model (LightFM instance): fitted LightFM model
N (int): Number of top similar items to return
Returns:
pandas.DataFrame: top N most similar items with score
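
    Example:
        Illustrative usage sketch; assumes `model` is a fitted LightFM model and
        `item_features` is the scipy sparse CSR matrix used during fitting
        (or None if the model was fitted without item features):

        >>> top_items = similar_items(item_id=7, item_features=item_features, model=model)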
"""
_, item_representations = model.get_item_representations(features=item_features)
# Cosine similarity
scores = item_representations.dot(item_representations[item_id, :])
item_norms = np.linalg.norm(item_representations, axis=1)
item_norms[item_norms == 0] = 1e-10
scores /= item_norms
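    # take the top N+1 scores, since the reference item itself will be the best match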
best = np.argpartition(scores, -(N + 1))[-(N + 1) :]
return pd.DataFrame(
sorted(zip(best, scores[best] / item_norms[item_id]), key=lambda x: -x[1])[1:],
columns=["itemID", "score"],
)


def prepare_test_df(test_idx, uids, iids, uid_map, iid_map, weights):
"""Function to prepare test df for evaluation
Args:
test_idx (slice): slice of test indices
uids (numpy.ndarray): Array of internal user indices
iids (numpy.ndarray): Array of internal item indices
        uid_map (dict): Mapping of external user ids to internal user indices; its ordered keys are used to recover external ids
        iid_map (dict): Mapping of external item ids to internal item indices; its ordered keys are used to recover external ids
        weights (numpy.float32 coo_matrix): User-item interaction matrix
Returns:
pandas.DataFrame: user-item selected for testing
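
    Example:
        Illustrative usage sketch; assumes `interactions` and `weights` come from
        lightfm.data.Dataset.build_interactions, with `uids = interactions.row`,
        `iids = interactions.col`, and `test_idx` a slice of those arrays:

        >>> test_df = prepare_test_df(test_idx, uids, iids, uid_map, iid_map, weights)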
"""
test_df = pd.DataFrame(
zip(
uids[test_idx],
iids[test_idx],
[list(uid_map.keys())[x] for x in uids[test_idx]],
[list(iid_map.keys())[x] for x in iids[test_idx]],
),
columns=["uid", "iid", "userID", "itemID"],
)
dok_weights = weights.todok()
test_df["rating"] = test_df.apply(lambda x: dok_weights[x.uid, x.iid], axis=1)
return test_df[["userID", "itemID", "rating"]]


def prepare_all_predictions(
data,
uid_map,
iid_map,
interactions,
model,
num_threads,
user_features=None,
item_features=None,
):
"""Function to prepare all predictions for evaluation.
Args:
        data (pandas.DataFrame): Dataframe of all users, items and ratings as loaded
        uid_map (dict): Mapping of external user ids to internal user indices
        iid_map (dict): Mapping of external item ids to internal item indices
        interactions (np.float32 coo_matrix): User-item interaction matrix
model (LightFM instance): fitted LightFM model
num_threads (int): number of parallel computation threads
user_features (np.float32 csr_matrix): User weights over features
item_features (np.float32 csr_matrix): Item weights over features
Returns:
pandas.DataFrame: all predictions
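
    Example:
        Illustrative usage sketch; assumes `data` is the full ratings dataframe
        and the maps and interaction matrix come from a lightfm.data.Dataset:

        >>> all_predictions = prepare_all_predictions(
        ...     data, uid_map, iid_map, interactions, fitted_model, num_threads=4
        ... )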
"""
    users, items, preds = [], [], []  # noqa: F841
    item = list(data.itemID.unique())
    # build the full cartesian product of users and items
    for user in data.userID.unique():
        users.extend([user] * len(item))
        items.extend(item)
all_predictions = pd.DataFrame(data={"userID": users, "itemID": items})
all_predictions["uid"] = all_predictions.userID.map(uid_map)
all_predictions["iid"] = all_predictions.itemID.map(iid_map)
dok_weights = interactions.todok()
all_predictions["rating"] = all_predictions.apply(
lambda x: dok_weights[x.uid, x.iid], axis=1
)
all_predictions = all_predictions[all_predictions.rating < 1].reset_index(drop=True)
all_predictions = all_predictions.drop("rating", axis=1)
all_predictions["prediction"] = all_predictions.apply(
lambda x: model.predict(
user_ids=np.array([x["uid"]], dtype=np.int32),
item_ids=np.array([x["iid"]], dtype=np.int32),
user_features=user_features,
item_features=item_features,
num_threads=num_threads,
)[0],
axis=1,
)
return all_predictions[["userID", "itemID", "prediction"]]