Source code for recommenders.models.newsrec.newsrec_utils

# Copyright (c) Recommenders contributors.
# Licensed under the MIT License.


from recommenders.models.deeprec.deeprec_utils import (
    flat_config,
    HParams,
    load_yaml,
)
import random
import re


def check_type(config):
    """Check that the config parameters are the correct type.

    Only parameters that are present in ``config`` are checked; missing
    parameters are silently skipped.

    Args:
        config (dict): Configuration dictionary.

    Raises:
        TypeError: If the parameters are not the correct type.
    """
    int_parameters = [
        "word_size",
        "his_size",
        "title_size",
        "body_size",
        "npratio",
        "word_emb_dim",
        "attention_hidden_dim",
        "epochs",
        "batch_size",
        "show_step",
        "save_epoch",
        "head_num",
        "head_dim",
        "user_num",
        "filter_num",
        "window_size",
        "gru_unit",
        "user_emb_dim",
        "vert_emb_dim",
        "subvert_emb_dim",
    ]
    float_parameters = ["learning_rate", "dropout"]
    # NOTE: "dense_activation" and "type" must be separate entries. Without
    # the comma between them Python concatenates the adjacent literals into
    # "dense_activationtype" and neither parameter is ever validated.
    str_parameters = [
        "wordEmb_file",
        "wordDict_file",
        "userDict_file",
        "vertDict_file",
        "subvertDict_file",
        "method",
        "loss",
        "optimizer",
        "cnn_activation",
        "dense_activation",
        "type",
    ]
    list_parameters = ["layer_sizes", "activation"]
    bool_parameters = ["support_quick_scoring"]

    # (expected type, label used in the error message, parameter names);
    # the groups are checked in this order.
    type_groups = [
        (int, "int", int_parameters),
        (float, "float", float_parameters),
        (str, "str", str_parameters),
        (list, "list", list_parameters),
        (bool, "bool", bool_parameters),
    ]
    for expected_type, label, parameters in type_groups:
        for param in parameters:
            if param in config and not isinstance(config[param], expected_type):
                raise TypeError(
                    "Parameters {0} must be {1}".format(param, label)
                )
def check_nn_config(f_config):
    """Check neural networks configuration.

    Verifies that every parameter required by the selected ``model_type`` is
    present, that the ``data_format`` matches the model, and finally that the
    parameter types are correct (via ``check_type``). Model types other than
    nrms, naml, lstur and npa have no required parameters.

    Args:
        f_config (dict): Neural network configuration. Must contain the key
            ``"model_type"``.

    Raises:
        ValueError: If a required parameter is missing or ``data_format``
            does not match the model type.
        TypeError: If a parameter has the wrong type (raised by ``check_type``).
    """
    model_type = f_config["model_type"]

    if model_type in ["nrms", "NRMS"]:
        required_parameters = [
            "title_size",
            "his_size",
            "wordEmb_file",
            "wordDict_file",
            "userDict_file",
            "npratio",
            "data_format",
            "word_emb_dim",
            # nrms
            "head_num",
            "head_dim",
            # attention
            "attention_hidden_dim",
            "loss",
            "dropout",
        ]
    elif model_type in ["naml", "NAML"]:
        required_parameters = [
            "title_size",
            "body_size",
            "his_size",
            "wordEmb_file",
            "subvertDict_file",
            "vertDict_file",
            "wordDict_file",
            "userDict_file",
            "npratio",
            "data_format",
            "word_emb_dim",
            "vert_emb_dim",
            "subvert_emb_dim",
            # naml
            "filter_num",
            "cnn_activation",
            "window_size",
            "dense_activation",
            # attention
            "attention_hidden_dim",
            "loss",
            "dropout",
        ]
    elif model_type in ["lstur", "LSTUR"]:
        required_parameters = [
            "title_size",
            "his_size",
            "wordEmb_file",
            "wordDict_file",
            "userDict_file",
            "npratio",
            "data_format",
            "word_emb_dim",
            # lstur
            "gru_unit",
            "type",
            "filter_num",
            "cnn_activation",
            "window_size",
            # attention
            "attention_hidden_dim",
            "loss",
            "dropout",
        ]
    elif model_type in ["npa", "NPA"]:
        required_parameters = [
            "title_size",
            "his_size",
            "wordEmb_file",
            "wordDict_file",
            "userDict_file",
            "npratio",
            "data_format",
            "word_emb_dim",
            # npa
            "user_emb_dim",
            "filter_num",
            "cnn_activation",
            "window_size",
            # attention
            "attention_hidden_dim",
            "loss",
            "dropout",
        ]
    else:
        required_parameters = []

    # check required parameters; the first missing one (in list order) is reported
    for param in required_parameters:
        if param not in f_config:
            raise ValueError("Parameters {0} must be set".format(param))

    # The error messages name the models the branch actually covers.
    if model_type in ["nrms", "NRMS", "lstur", "LSTUR"]:
        if f_config["data_format"] != "news":
            raise ValueError(
                "For nrms and lstur model, data format must be 'news', but your set is {0}".format(
                    f_config["data_format"]
                )
            )
    elif model_type in ["naml", "NAML"]:
        if f_config["data_format"] != "naml":
            raise ValueError(
                "For naml model, data format must be 'naml', but your set is {0}".format(
                    f_config["data_format"]
                )
            )

    check_type(f_config)
def create_hparams(flags):
    """Build the hyperparameter object from defaults plus user settings.

    Every entry in ``flags`` overrides the default of the same name; entries
    without a default are added as-is.

    Args:
        flags (dict): Dictionary with the model requirements.

    Returns:
        HParams: Hyperparameter object.
    """
    defaults = dict(
        # data
        support_quick_scoring=False,
        # models
        dropout=0.0,
        attention_hidden_dim=200,
        # nrms
        head_num=4,
        head_dim=100,
        # naml
        filter_num=200,
        window_size=3,
        vert_emb_dim=100,
        subvert_emb_dim=100,
        # lstur
        gru_unit=400,
        type="ini",
        # npa
        user_emb_dim=50,
        # train
        learning_rate=0.001,
        optimizer="adam",
        epochs=10,
        batch_size=1,
        # show info
        show_step=1,
    )
    # User-supplied values take precedence over the defaults.
    defaults.update(flags)
    return HParams(defaults)
def prepare_hparams(yaml_file=None, **kwargs):
    """Assemble, validate and return the model hyperparameters.

    The configuration is read from ``yaml_file`` (if given) and flattened,
    then updated with ``kwargs`` so that keyword arguments override values
    from the file. The result is validated with ``check_nn_config`` before
    the hyperparameter object is created.

    Args:
        yaml_file (str): YAML file as configuration. If None, only ``kwargs``
            are used.
        **kwargs: Additional configuration entries.

    Returns:
        HParams: Hyperparameter object.
    """
    if yaml_file is None:
        settings = {}
    else:
        settings = flat_config(load_yaml(yaml_file))

    settings.update(kwargs)
    check_nn_config(settings)
    return create_hparams(settings)
def word_tokenize(sent):
    """Tokenize a sentence into lower-cased words and punctuation marks.

    Args:
        sent (str): Input sentence.

    Returns:
        list: Tokens found in the lower-cased sentence; an empty list when
        ``sent`` is not a string.
    """
    if not isinstance(sent, str):
        return []
    # A token is either a run of word characters or one of . , ! ? ; |
    return re.findall(r"[\w]+|[.,!?;|]", sent.lower())
def newsample(news, ratio):
    """Draw ``ratio`` items from a news list, zero-padding short lists.

    Args:
        news (list): Input news list.
        ratio (int): Number of items to return.

    Returns:
        list: ``news`` followed by zeros up to length ``ratio`` when the list
        is shorter than ``ratio``; otherwise a random sample of ``ratio``
        items drawn without replacement.
    """
    shortfall = ratio - len(news)
    if shortfall > 0:
        # Not enough items to sample from: keep them all and pad with zeros.
        padding = [0] * shortfall
        return news + padding
    return random.sample(news, ratio)
def get_mind_data_set(type):
    """Look up the download location and archive names of a MIND dataset.

    Args:
        type (str): Type of MIND dataset, must be in ['large', 'small', 'demo'].

    Returns:
        tuple: Base URL followed by the train, dev and utils archive names.
    """
    assert type in ["large", "small", "demo"]

    locations = {
        "large": (
            "https://mind201910small.blob.core.windows.net/release/",
            "MINDlarge_train.zip",
            "MINDlarge_dev.zip",
            "MINDlarge_utils.zip",
        ),
        "small": (
            "https://mind201910small.blob.core.windows.net/release/",
            "MINDsmall_train.zip",
            "MINDsmall_dev.zip",
            "MINDsmall_utils.zip",
        ),
        "demo": (
            "https://recodatasets.z20.web.core.windows.net/newsrec/",
            "MINDdemo_train.zip",
            "MINDdemo_dev.zip",
            "MINDdemo_utils.zip",
        ),
    }
    # NOTE: .get keeps the None result for an unknown type when the assertion
    # above is disabled (python -O).
    return locations.get(type)