Source code for recommenders.models.deeprec.io.dkn_iterator

# Copyright (c) Recommenders contributors.
# Licensed under the MIT License.

import tensorflow as tf
import numpy as np

from recommenders.models.deeprec.io.iterator import BaseIterator


__all__ = ["DKNTextIterator"]


class DKNTextIterator(BaseIterator):
    """Data loader for the DKN model.

    DKN requires a special data format, where each instance contains a label, the
    candidate news article, and the user's clicked news articles. Articles are
    represented by title words and title entities, and words and entities are aligned.

    The iterator does not load the whole dataset into memory. Instead, it loads data
    into memory per mini-batch, so that large files can be used as input.
    """

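    # The on-disk formats, as implied by the parsing code below (all concrete
    # values here are illustrative, not taken from the repository docs):
    #
    #   news_feature_file  -- one article per line:
    #       <newsid> <comma-separated word ids> <comma-separated entity ids>
    #       e.g. "N1 12,7,0,0 3,0,0,0"
    #   user_history_file  -- one user per line:
    #       <userid> <comma-separated clicked newsids>
    #       e.g. "U1 N1,N2,N3"
    #   training/eval file -- one impression per line:
    #       <label> <userid> <candidate newsid>[%<impression id>]
    #       e.g. "1 U1 N2%7"
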
    def __init__(self, hparams, graph, col_spliter=" ", ID_spliter="%"):
        """Initialize an iterator. Create the necessary placeholders for the model.

        Args:
            hparams (object): Global hyper-parameters. Key settings such as
                batch_size, doc_size, and history_size are read from it.
            graph (object): The running graph. All created placeholders will be
                added to this graph.
            col_spliter (str): Column separator within one line.
            ID_spliter (str): ID separator within one line.
        """
        self.col_spliter = col_spliter
        self.ID_spliter = ID_spliter
        self.batch_size = hparams.batch_size
        self.doc_size = hparams.doc_size
        self.history_size = hparams.history_size

        self.graph = graph
        # Placeholders for one mini-batch of labels, candidate articles,
        # and click histories.
        with self.graph.as_default():
            self.labels = tf.compat.v1.placeholder(tf.float32, [None, 1], name="label")
            self.candidate_news_index_batch = tf.compat.v1.placeholder(
                tf.int64, [self.batch_size, self.doc_size], name="candidate_news_index"
            )
            self.click_news_index_batch = tf.compat.v1.placeholder(
                tf.int64,
                [self.batch_size, self.history_size, self.doc_size],
                name="click_news_index",
            )
            self.candidate_news_entity_index_batch = tf.compat.v1.placeholder(
                tf.int64,
                [self.batch_size, self.doc_size],
                name="candidate_news_entity_index",
            )
            self.click_news_entity_index_batch = tf.compat.v1.placeholder(
                tf.int64,
                [self.batch_size, self.history_size, self.doc_size],
                name="click_news_entity_index",
            )

        # Load the word and entity indices of every news article.
        self.news_word_index = {}
        self.news_entity_index = {}
        with tf.io.gfile.GFile(hparams.news_feature_file, "r") as rd:
            for line in rd:
                newsid, word_index, entity_index = line.strip().split(col_spliter)
                self.news_word_index[newsid] = [
                    int(item) for item in word_index.split(",")
                ]
                self.news_entity_index[newsid] = [
                    int(item) for item in entity_index.split(",")
                ]

        # Load each user's click history, truncated to the most recent
        # history_size articles and padded with all-zero documents.
        self.user_history = {}
        with tf.io.gfile.GFile(hparams.user_history_file, "r") as rd:
            for line in rd:
                if len(line.strip().split(col_spliter)) == 1:
                    userid = line.strip()
                    user_history = []
                else:
                    userid, user_history_string = line.strip().split(col_spliter)
                    user_history = user_history_string.split(",")
                click_news_index = []
                click_news_entity_index = []
                if len(user_history) > self.history_size:
                    user_history = user_history[-self.history_size :]
                for newsid in user_history:
                    click_news_index.append(self.news_word_index[newsid])
                    click_news_entity_index.append(self.news_entity_index[newsid])
                for i in range(self.history_size - len(user_history)):
                    click_news_index.append(np.zeros(self.doc_size))
                    click_news_entity_index.append(np.zeros(self.doc_size))
                self.user_history[userid] = (click_news_index, click_news_entity_index)

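    # A minimal construction sketch (hypothetical usage; `hparams` is any
    # object exposing the attributes read above, for example one produced by
    # recommenders.models.deeprec.deeprec_utils.prepare_hparams):
    #
    #   graph = tf.Graph()
    #   iterator = DKNTextIterator(hparams, graph)
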
    def parser_one_line(self, line):
        """Parse one string line into feature values.

        Args:
            line (str): A string indicating one instance.

        Returns:
            tuple: Parsed results, including `label`, `candidate_news_index`,
            `click_news_index`, `candidate_news_entity_index`,
            `click_news_entity_index`, and `impression_id`.
        """
        impression_id = 0
        words = line.strip().split(self.ID_spliter)
        if len(words) == 2:
            impression_id = words[1].strip()

        cols = words[0].strip().split(self.col_spliter)
        label = float(cols[0])
        userid = cols[1]
        candidate_news = cols[2]

        candidate_news_index = self.news_word_index[candidate_news]
        candidate_news_entity_index = self.news_entity_index[candidate_news]
        click_news_index = self.user_history[userid][0]
        click_news_entity_index = self.user_history[userid][1]

        return (
            label,
            candidate_news_index,
            click_news_index,
            candidate_news_entity_index,
            click_news_entity_index,
            impression_id,
        )

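    # For illustration (values assumed): the line "1 U1 N2%7" parses to
    # label=1.0, userid="U1", candidate_news="N2", and impression_id="7"; a
    # line without the "%" part leaves impression_id at its default of 0.
    # Note that _convert_data later casts impression ids to int64, so numeric
    # ids are expected.
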
    def load_data_from_file(self, infile):
        """Read and parse data from a file.

        Args:
            infile (str): Text input file. Each line in this file is an instance.

        Yields:
            obj, list, int:
            - A `feed_dict` of parsed results that can be fed to the graph.
            - Impression id list.
            - Size of the data in the batch.
        """
        candidate_news_index_batch = []
        click_news_index_batch = []
        candidate_news_entity_index_batch = []
        click_news_entity_index_batch = []
        label_list = []
        impression_id_list = []
        cnt = 0

        with tf.io.gfile.GFile(infile, "r") as rd:
            for line in rd:
                (
                    label,
                    candidate_news_index,
                    click_news_index,
                    candidate_news_entity_index,
                    click_news_entity_index,
                    impression_id,
                ) = self.parser_one_line(line)

                candidate_news_index_batch.append(candidate_news_index)
                click_news_index_batch.append(click_news_index)
                candidate_news_entity_index_batch.append(candidate_news_entity_index)
                click_news_entity_index_batch.append(click_news_entity_index)
                label_list.append(label)
                impression_id_list.append(impression_id)

                cnt += 1
                if cnt >= self.batch_size:
                    res = self._convert_data(
                        label_list,
                        candidate_news_index_batch,
                        click_news_index_batch,
                        candidate_news_entity_index_batch,
                        click_news_entity_index_batch,
                        impression_id_list,
                    )
                    data_size = self.batch_size
                    yield self.gen_feed_dict(res), impression_id_list, data_size
                    candidate_news_index_batch = []
                    click_news_index_batch = []
                    candidate_news_entity_index_batch = []
                    click_news_entity_index_batch = []
                    label_list = []
                    impression_id_list = []
                    cnt = 0

            if cnt > 0:
                # Pad the final partial batch by cycling through the instances
                # already collected; data_size records how many rows are real.
                data_size = cnt
                while cnt < self.batch_size:
                    candidate_news_index_batch.append(
                        candidate_news_index_batch[cnt % data_size]
                    )
                    click_news_index_batch.append(
                        click_news_index_batch[cnt % data_size]
                    )
                    candidate_news_entity_index_batch.append(
                        candidate_news_entity_index_batch[cnt % data_size]
                    )
                    click_news_entity_index_batch.append(
                        click_news_entity_index_batch[cnt % data_size]
                    )
                    label_list.append(label_list[cnt % data_size])
                    impression_id_list.append(impression_id_list[cnt % data_size])
                    cnt += 1
                res = self._convert_data(
                    label_list,
                    candidate_news_index_batch,
                    click_news_index_batch,
                    candidate_news_entity_index_batch,
                    click_news_entity_index_batch,
                    impression_id_list,
                )
                yield self.gen_feed_dict(res), impression_id_list, data_size

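    # A hedged training-loop sketch (`sess`, `model.update`, and `train_file`
    # are assumptions, not part of this module):
    #
    #   for feed_dict, impression_ids, data_size in iterator.load_data_from_file(train_file):
    #       sess.run(model.update, feed_dict=feed_dict)
    #
    # The final batch is padded by repeating earlier instances; data_size
    # reports how many rows of the batch are real.
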
    def load_infer_data_from_file(self, infile):
        """Read and parse data from a file for inferring document embeddings.

        Args:
            infile (str): Text input file. Each line in this file is an instance.

        Yields:
            obj, list, int:
            - A `feed_dict` of parsed results that can be fed to the graph.
            - News id list.
            - Size of the data in the batch.
        """
        newsid_list = []
        candidate_news_index_batch = []
        candidate_news_entity_index_batch = []
        cnt = 0
        with tf.io.gfile.GFile(infile, "r") as rd:
            for line in rd:
                newsid, word_index, entity_index = line.strip().split(" ")
                newsid_list.append(newsid)
                candidate_news_index = []
                candidate_news_entity_index = []
                for item in word_index.split(","):
                    candidate_news_index.append(int(item))
                for item in entity_index.split(","):
                    candidate_news_entity_index.append(int(item))

                candidate_news_index_batch.append(candidate_news_index)
                candidate_news_entity_index_batch.append(candidate_news_entity_index)
                cnt += 1
                if cnt >= self.batch_size:
                    res = self._convert_infer_data(
                        candidate_news_index_batch, candidate_news_entity_index_batch
                    )
                    data_size = self.batch_size
                    yield self.gen_infer_feed_dict(res), newsid_list, data_size
                    candidate_news_index_batch = []
                    candidate_news_entity_index_batch = []
                    newsid_list = []
                    cnt = 0

            if cnt > 0:
                # Pad the final partial batch, as in load_data_from_file.
                data_size = cnt
                while cnt < self.batch_size:
                    candidate_news_index_batch.append(
                        candidate_news_index_batch[cnt % data_size]
                    )
                    candidate_news_entity_index_batch.append(
                        candidate_news_entity_index_batch[cnt % data_size]
                    )
                    cnt += 1
                res = self._convert_infer_data(
                    candidate_news_index_batch, candidate_news_entity_index_batch
                )
                yield self.gen_infer_feed_dict(res), newsid_list, data_size

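    # A similar sketch for document-embedding inference (the embedding op name
    # `doc_embedding_op` is an assumption):
    #
    #   for feed_dict, newsids, data_size in iterator.load_infer_data_from_file(news_file):
    #       embeddings = sess.run(doc_embedding_op, feed_dict=feed_dict)[:data_size]
    #
    # Slicing with data_size drops the padded rows of the last batch.
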
    def _convert_data(
        self,
        label_list,
        candidate_news_index_batch,
        click_news_index_batch,
        candidate_news_entity_index_batch,
        click_news_entity_index_batch,
        impression_id_list,
    ):
        """Convert data into numpy arrays that are suitable for further model operation.

        Args:
            label_list (list): A list of ground-truth labels.
            candidate_news_index_batch (list): Word indices of the candidate news articles.
            click_news_index_batch (list): Word indices of the user's clicked news articles.
            candidate_news_entity_index_batch (list): Entity indices of the candidate news articles.
            click_news_entity_index_batch (list): Entity indices of the user's clicked news articles.
            impression_id_list (list): The session's impression indices.

        Returns:
            dict: A dictionary containing multiple numpy arrays that are convenient for further operation.
        """
        res = {}
        res["labels"] = np.asarray([[label] for label in label_list], dtype=np.float32)
        res["candidate_news_index_batch"] = np.asarray(
            candidate_news_index_batch, dtype=np.int64
        )
        res["click_news_index_batch"] = np.asarray(
            click_news_index_batch, dtype=np.int64
        )
        res["candidate_news_entity_index_batch"] = np.asarray(
            candidate_news_entity_index_batch, dtype=np.int64
        )
        res["click_news_entity_index_batch"] = np.asarray(
            click_news_entity_index_batch, dtype=np.int64
        )
        res["impression_id"] = np.asarray(impression_id_list, dtype=np.int64)
        return res

    def _convert_infer_data(
        self, candidate_news_index_batch, candidate_news_entity_index_batch
    ):
        """Convert data into numpy arrays that are suitable for further model operation.

        Args:
            candidate_news_index_batch (list): Word indices of the candidate news articles.
            candidate_news_entity_index_batch (list): Entity indices of the candidate news articles.

        Returns:
            dict: A dictionary containing multiple numpy arrays that are convenient for further operation.
        """
        res = {}
        res["candidate_news_index_batch"] = np.asarray(
            candidate_news_index_batch, dtype=np.int64
        )
        res["candidate_news_entity_index_batch"] = np.asarray(
            candidate_news_entity_index_batch, dtype=np.int64
        )
        return res

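    # Shape summary for the arrays built above, with B=batch_size,
    # H=history_size, D=doc_size: labels -> (B, 1); candidate word/entity
    # indices -> (B, D); click word/entity indices -> (B, H, D);
    # impression_id -> (B,).
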
    def gen_feed_dict(self, data_dict):
        """Construct a dictionary that maps graph elements to values.

        Args:
            data_dict (dict): A dictionary that maps string names to numpy arrays.

        Returns:
            dict: A dictionary that maps graph elements to numpy arrays.
        """
        feed_dict = {
            self.labels: data_dict["labels"].reshape([-1, 1]),
            self.candidate_news_index_batch: data_dict[
                "candidate_news_index_batch"
            ].reshape([self.batch_size, self.doc_size]),
            self.click_news_index_batch: data_dict["click_news_index_batch"].reshape(
                [self.batch_size, self.history_size, self.doc_size]
            ),
            self.candidate_news_entity_index_batch: data_dict[
                "candidate_news_entity_index_batch"
            ].reshape([-1, self.doc_size]),
            self.click_news_entity_index_batch: data_dict[
                "click_news_entity_index_batch"
            ].reshape([-1, self.history_size, self.doc_size]),
        }
        return feed_dict

    def gen_infer_feed_dict(self, data_dict):
        """Construct a dictionary that maps graph elements to values.

        Args:
            data_dict (dict): A dictionary that maps string names to numpy arrays.

        Returns:
            dict: A dictionary that maps graph elements to numpy arrays.
        """
        feed_dict = {
            self.candidate_news_index_batch: data_dict[
                "candidate_news_index_batch"
            ].reshape([self.batch_size, self.doc_size]),
            self.candidate_news_entity_index_batch: data_dict[
                "candidate_news_entity_index_batch"
            ].reshape([-1, self.doc_size]),
        }
        return feed_dict