# Copyright (c) Recommenders contributors.
# Licensed under the MIT License.
import tensorflow as tf
import numpy as np
import random
from recommenders.models.deeprec.io.sequential_iterator import SequentialIterator
from recommenders.models.deeprec.deeprec_utils import load_dict
__all__ = ["NextItNetIterator"]
class NextItNetIterator(SequentialIterator):
    """Data loader for the NextItNet model.

    NextItNet requires a special type of data format. In training stage, each instance
    will produce `(sequence_length * train_num_ngs)` target items and labels, to let
    NextItNet output predictions of every item in a sequence except only of the last
    item.
    """

    def __init__(self, hparams, graph, col_spliter="\t"):
        """Initialize an iterator. Create necessary placeholders for the model.

        Different from the plain sequential iterator: the ``labels``, ``items`` and
        ``cates`` placeholders are 2-D (``[batch, sequence]``) because NextItNet
        predicts a target for every position in the history, not just the last one.

        Args:
            hparams (object): Global hyper-parameters. Some key settings such as
                #_feature and #_field are there.
            graph (object): The running graph. All created placeholders will be
                added to this graph.
            col_spliter (str): Column splitter in one line.
        """
        self.col_spliter = col_spliter
        self.userdict, self.itemdict, self.catedict = (
            load_dict(hparams.user_vocab),
            load_dict(hparams.item_vocab),
            load_dict(hparams.cate_vocab),
        )
        self.max_seq_length = hparams.max_seq_length
        self.batch_size = hparams.batch_size
        self.iter_data = dict()

        self.graph = graph
        with self.graph.as_default():
            # Per-position labels/items/cates: second dimension is the sequence axis.
            self.labels = tf.compat.v1.placeholder(
                tf.float32, [None, None], name="label"
            )
            self.users = tf.compat.v1.placeholder(tf.int32, [None], name="users")
            self.items = tf.compat.v1.placeholder(tf.int32, [None, None], name="items")
            self.cates = tf.compat.v1.placeholder(tf.int32, [None, None], name="cates")
            self.item_history = tf.compat.v1.placeholder(
                tf.int32, [None, self.max_seq_length], name="item_history"
            )
            self.item_cate_history = tf.compat.v1.placeholder(
                tf.int32, [None, self.max_seq_length], name="item_cate_history"
            )
            self.mask = tf.compat.v1.placeholder(
                tf.int32, [None, self.max_seq_length], name="mask"
            )
            self.time = tf.compat.v1.placeholder(tf.float32, [None], name="time")
            self.time_diff = tf.compat.v1.placeholder(
                tf.float32, [None, self.max_seq_length], name="time_diff"
            )
            self.time_from_first_action = tf.compat.v1.placeholder(
                tf.float32, [None, self.max_seq_length], name="time_from_first_action"
            )
            self.time_to_now = tf.compat.v1.placeholder(
                tf.float32, [None, self.max_seq_length], name="time_to_now"
            )

    def _convert_data(
        self,
        label_list,
        user_list,
        item_list,
        item_cate_list,
        item_history_batch,
        item_cate_history_batch,
        time_list,
        time_diff_list,
        time_from_first_action_list,
        time_to_now_list,
        batch_num_ngs,
    ):
        """Convert data into numpy arrays that are good for further model operation.

        Note: This is different from `sequential_iterator`. When ``batch_num_ngs`` is
        nonzero (training), each instance is replicated ``batch_num_ngs + 1`` times:
        one positive row whose per-position targets are the shifted history plus the
        current item, followed by ``batch_num_ngs`` negative rows whose targets are
        sampled from other instances in the batch.

        Args:
            label_list (list): A list of ground-truth labels.
            user_list (list): A list of user indexes.
            item_list (list): A list of item indexes.
            item_cate_list (list): A list of category indexes.
            item_history_batch (list): A list of item history indexes.
            item_cate_history_batch (list): A list of category history indexes.
            time_list (list): A list of current timestamp.
            time_diff_list (list): A list of timestamp between each sequential operations.
            time_from_first_action_list (list): A list of timestamp from the first operation.
            time_to_now_list (list): A list of timestamp to the current time.
            batch_num_ngs (int): The number of negative sampling while training in mini-batch.

        Returns:
            dict: A dictionary, contains multiple numpy arrays that are convenient
            for further operation. Returns ``None`` when the training batch is too
            small to support in-batch negative sampling.
        """
        if batch_num_ngs:
            instance_cnt = len(label_list)
            # Too few instances makes in-batch negative sampling unreliable
            # (samples are drawn from other rows of the same batch); drop the batch.
            if instance_cnt < 5:
                return

            label_list_all = []
            item_list_all = []
            item_cate_list_all = []
            # Replicate users/times once per (positive + negatives) row.
            user_list_all = np.asarray(
                [[user] * (batch_num_ngs + 1) for user in user_list], dtype=np.int32
            ).flatten()
            time_list_all = np.asarray(
                [[t] * (batch_num_ngs + 1) for t in time_list], dtype=np.float32
            ).flatten()

            history_lengths = [len(item_history_batch[i]) for i in range(instance_cnt)]
            max_seq_length_batch = self.max_seq_length
            item_history_batch_all = np.zeros(
                (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch),
                dtype=np.int32,
            )
            item_cate_history_batch_all = np.zeros(
                (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch),
                dtype=np.int32,
            )
            time_diff_batch = np.zeros(
                (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch),
                dtype=np.float32,
            )
            time_from_first_action_batch = np.zeros(
                (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch),
                dtype=np.float32,
            )
            time_to_now_batch = np.zeros(
                (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch),
                dtype=np.float32,
            )
            mask = np.zeros(
                (instance_cnt * (1 + batch_num_ngs), max_seq_length_batch),
                dtype=np.float32,
            )

            # Right-align each (truncated) history into the fixed-length buffers and
            # copy it into every replicated row of the instance.
            for i in range(instance_cnt):
                this_length = min(history_lengths[i], max_seq_length_batch)
                for index in range(batch_num_ngs + 1):
                    item_history_batch_all[
                        i * (batch_num_ngs + 1) + index, -this_length:
                    ] = np.asarray(item_history_batch[i][-this_length:], dtype=np.int32)
                    item_cate_history_batch_all[
                        i * (batch_num_ngs + 1) + index, -this_length:
                    ] = np.asarray(
                        item_cate_history_batch[i][-this_length:], dtype=np.int32
                    )
                    mask[i * (batch_num_ngs + 1) + index, -this_length:] = 1.0
                    time_diff_batch[
                        i * (batch_num_ngs + 1) + index, -this_length:
                    ] = np.asarray(time_diff_list[i][-this_length:], dtype=np.float32)
                    time_from_first_action_batch[
                        i * (batch_num_ngs + 1) + index, -this_length:
                    ] = np.asarray(
                        time_from_first_action_list[i][-this_length:], dtype=np.float32
                    )
                    time_to_now_batch[
                        i * (batch_num_ngs + 1) + index, -this_length:
                    ] = np.asarray(time_to_now_list[i][-this_length:], dtype=np.float32)

            for i in range(instance_cnt):
                # Positive targets: history shifted left by one, with the current
                # item appended (predict every next item in the sequence).
                positive_item = [
                    *item_history_batch_all[i * (batch_num_ngs + 1)][1:],
                    item_list[i],
                ]
                positive_item_cate = [
                    *item_cate_history_batch_all[i * (batch_num_ngs + 1)][1:],
                    item_cate_list[i],
                ]
                label_list_all.append([1] * max_seq_length_batch)
                item_list_all.append(positive_item)
                item_cate_list_all.append(positive_item_cate)

                count = 0
                while count < batch_num_ngs:
                    negative_item_list = []
                    negative_item_cate_list = []
                    count_inner = 1
                    while count_inner <= max_seq_length_batch:
                        random_value = random.randint(0, instance_cnt - 1)
                        negative_item = item_list[random_value]
                        # Redraw when the sampled item collides with the positive
                        # target at this position.
                        # NOTE(review): this can spin if every item in the batch is
                        # identical — mitigated by the instance_cnt >= 5 guard above.
                        if negative_item == positive_item[count_inner - 1]:
                            continue
                        negative_item_list.append(negative_item)
                        negative_item_cate_list.append(item_cate_list[random_value])
                        count_inner += 1

                    label_list_all.append([0] * max_seq_length_batch)
                    item_list_all.append(negative_item_list)
                    item_cate_list_all.append(negative_item_cate_list)
                    count += 1

            res = {}
            res["labels"] = np.asarray(
                label_list_all, dtype=np.float32
            )  # .reshape(-1,1)
            res["users"] = user_list_all
            res["items"] = np.asarray(item_list_all, dtype=np.int32)
            res["cates"] = np.asarray(item_cate_list_all, dtype=np.int32)
            res["item_history"] = item_history_batch_all
            res["item_cate_history"] = item_cate_history_batch_all
            res["mask"] = mask
            res["time"] = time_list_all
            res["time_diff"] = time_diff_batch
            res["time_from_first_action"] = time_from_first_action_batch
            res["time_to_now"] = time_to_now_batch
            return res
        else:
            # Evaluation / no negative sampling: one row per instance, scalar target.
            instance_cnt = len(label_list)
            history_lengths = [len(item_history_batch[i]) for i in range(instance_cnt)]
            max_seq_length_batch = self.max_seq_length
            item_history_batch_all = np.zeros(
                (instance_cnt, max_seq_length_batch), dtype=np.int32
            )
            item_cate_history_batch_all = np.zeros(
                (instance_cnt, max_seq_length_batch), dtype=np.int32
            )
            time_diff_batch = np.zeros(
                (instance_cnt, max_seq_length_batch), dtype=np.float32
            )
            time_from_first_action_batch = np.zeros(
                (instance_cnt, max_seq_length_batch), dtype=np.float32
            )
            time_to_now_batch = np.zeros(
                (instance_cnt, max_seq_length_batch), dtype=np.float32
            )
            mask = np.zeros((instance_cnt, max_seq_length_batch), dtype=np.float32)

            for i in range(instance_cnt):
                this_length = min(history_lengths[i], max_seq_length_batch)
                item_history_batch_all[i, -this_length:] = item_history_batch[i][
                    -this_length:
                ]
                item_cate_history_batch_all[i, -this_length:] = item_cate_history_batch[
                    i
                ][-this_length:]
                mask[i, -this_length:] = 1.0
                time_diff_batch[i, -this_length:] = time_diff_list[i][-this_length:]
                time_from_first_action_batch[
                    i, -this_length:
                ] = time_from_first_action_list[i][-this_length:]
                time_to_now_batch[i, -this_length:] = time_to_now_list[i][-this_length:]

            res = {}
            res["labels"] = np.asarray(label_list, dtype=np.float32).reshape([-1, 1])
            res["users"] = np.asarray(user_list, dtype=np.float32)
            res["items"] = np.asarray(item_list, dtype=np.int32).reshape([-1, 1])
            res["cates"] = np.asarray(item_cate_list, dtype=np.int32).reshape([-1, 1])
            res["item_history"] = item_history_batch_all
            res["item_cate_history"] = item_cate_history_batch_all
            res["mask"] = mask
            res["time"] = np.asarray(time_list, dtype=np.float32)
            res["time_diff"] = time_diff_batch
            res["time_from_first_action"] = time_from_first_action_batch
            res["time_to_now"] = time_to_now_batch
            return res