# Copyright (c) Recommenders contributors.
# Licensed under the MIT License.
import os
import random
import logging
import json
import numpy as np
import re
from tqdm import tqdm
from nltk.tokenize import RegexpTokenizer
from recommenders.datasets.download_utils import (
maybe_download,
download_path,
unzip_file,
)
URL_MIND_LARGE_TRAIN = (
"https://mind201910small.blob.core.windows.net/release/MINDlarge_train.zip"
)
URL_MIND_LARGE_VALID = (
"https://mind201910small.blob.core.windows.net/release/MINDlarge_dev.zip"
)
URL_MIND_SMALL_TRAIN = (
"https://mind201910small.blob.core.windows.net/release/MINDsmall_train.zip"
)
URL_MIND_SMALL_VALID = (
"https://mind201910small.blob.core.windows.net/release/MINDsmall_dev.zip"
)
URL_MIND_DEMO_TRAIN = (
"https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_train.zip"
)
URL_MIND_DEMO_VALID = (
"https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_dev.zip"
)
URL_MIND_DEMO_UTILS = (
"https://recodatasets.z20.web.core.windows.net/newsrec/MINDdemo_utils.zip"
)
URL_MIND = {
"large": (URL_MIND_LARGE_TRAIN, URL_MIND_LARGE_VALID),
"small": (URL_MIND_SMALL_TRAIN, URL_MIND_SMALL_VALID),
"demo": (URL_MIND_DEMO_TRAIN, URL_MIND_DEMO_VALID),
}
logger = logging.getLogger()
def download_mind(size="small", dest_path=None):
    """Download the MIND dataset.

    Args:
        size (str): Dataset size. One of ["small", "large", "demo"].
        dest_path (str): Download path. If None, the dataset is downloaded to a temporary path.

    Returns:
        str, str: Paths to the downloaded train and validation zip files.
    """
size_options = ["small", "large", "demo"]
if size not in size_options:
raise ValueError(f"Wrong size option, available options are {size_options}")
url_train, url_valid = URL_MIND[size]
with download_path(dest_path) as path:
train_path = maybe_download(url=url_train, work_directory=path)
valid_path = maybe_download(url=url_valid, work_directory=path)
return train_path, valid_path
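# Illustrative usage (a sketch, not part of the original module). The returned values
# are the paths of the downloaded zip archives; "./mind_demo" is a hypothetical folder:
#
#   train_zip, valid_zip = download_mind(size="demo", dest_path="./mind_demo")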
def read_clickhistory(path, filename):
    """Read a click-history (behaviors) file.

    Args:
        path (str): Folder path.
        filename (str): Filename.

    Returns:
        list, dict:
        - A list of user sessions, each holding the user_id, clicks, and positive and negative impressions.
        - A dictionary mapping user_id to click history.
    """
userid_history = {}
with open(os.path.join(path, filename)) as f:
lines = f.readlines()
sessions = []
for i in range(len(lines)):
_, userid, imp_time, click, imps = lines[i].strip().split("\t")
clicks = click.split(" ")
pos = []
neg = []
imps = imps.split(" ")
for imp in imps:
if imp.split("-")[1] == "1":
pos.append(imp.split("-")[0])
else:
neg.append(imp.split("-")[0])
userid_history[userid] = clicks
sessions.append([userid, clicks, pos, neg])
return sessions, userid_history
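# Illustrative usage (a sketch; assumes the downloaded archive has been extracted so
# that "behaviors.tsv" sits under the hypothetical folder "./mind_demo/train"):
#
#   sessions, history = read_clickhistory("./mind_demo/train", "behaviors.tsv")
#   userid, clicks, pos, neg = sessions[0]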
def _newsample(nnn, ratio):
    # Sample `ratio` items from the candidate list `nnn`; if `ratio` exceeds the list
    # length, tile the list first so that sampling without replacement is still possible.
    if ratio > len(nnn):
        return random.sample(nnn * (ratio // len(nnn) + 1), ratio)
    else:
        return random.sample(nnn, ratio)
def get_user_history(train_history, valid_history, user_history_path):
    """Generate the user history file.

    Args:
        train_history (dict): Train click history, keyed by user_id.
        valid_history (dict): Validation click history, keyed by user_id.
        user_history_path (str): Path to the output file.
    """
fp_user_history = open(user_history_path, "w", encoding="utf-8")
for userid in train_history:
fp_user_history.write(
"train_" + userid + " " + ",".join(train_history[userid]) + "\n"
)
for userid in valid_history:
fp_user_history.write(
"valid_" + userid + " " + ",".join(valid_history[userid]) + "\n"
)
fp_user_history.close()
if os.path.isfile(user_history_path):
logger.info(f"User history file {user_history_path} successfully generated")
else:
raise FileNotFoundError(f"Error when generating {user_history_path}")
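# Illustrative usage (a sketch; the click-history dictionaries come from
# read_clickhistory and the folder and output paths are hypothetical):
#
#   _, train_history = read_clickhistory("./mind_demo/train", "behaviors.tsv")
#   _, valid_history = read_clickhistory("./mind_demo/valid", "behaviors.tsv")
#   get_user_history(train_history, valid_history, "./mind_demo/user_history.txt")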
def _read_news(filepath, news_words, news_entities, tokenizer):
    # Each line of a MIND news file is tab-separated: column 0 holds the news id,
    # column 3 the title, and column 6 the JSON-encoded title entities.
with open(filepath, encoding="utf-8") as f:
lines = f.readlines()
for line in lines:
splitted = line.strip("\n").split("\t")
news_words[splitted[0]] = tokenizer.tokenize(splitted[3].lower())
news_entities[splitted[0]] = []
for entity in json.loads(splitted[6]):
news_entities[splitted[0]].append(
(entity["SurfaceForms"], entity["WikidataId"])
)
return news_words, news_entities
def get_words_and_entities(train_news, valid_news):
    """Load words and entities from the news files.

    Args:
        train_news (str): News train file.
        valid_news (str): News validation file.

    Returns:
        dict, dict: Word and entity dictionaries, keyed by news id.
    """
news_words = {}
news_entities = {}
tokenizer = RegexpTokenizer(r"\w+")
news_words, news_entities = _read_news(
train_news, news_words, news_entities, tokenizer
)
news_words, news_entities = _read_news(
valid_news, news_words, news_entities, tokenizer
)
return news_words, news_entities
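# Illustrative usage (a sketch; "news.tsv" is the MIND news file inside each
# extracted split, and the folder names are hypothetical):
#
#   news_words, news_entities = get_words_and_entities(
#       "./mind_demo/train/news.tsv", "./mind_demo/valid/news.tsv"
#   )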
def download_and_extract_glove(dest_path):
    """Download and extract the GloVe embeddings.

    Args:
        dest_path (str): Destination directory path for the downloaded file.

    Returns:
        str: File path where GloVe was extracted.
    """
# url = "http://nlp.stanford.edu/data/glove.6B.zip"
url = "https://huggingface.co/stanfordnlp/glove/resolve/main/glove.6B.zip"
filepath = maybe_download(url=url, work_directory=dest_path)
glove_path = os.path.join(dest_path, "glove")
unzip_file(filepath, glove_path, clean_zip_file=False)
return glove_path
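# Illustrative usage (a sketch; the extracted folder contains the files
# glove.6B.50d.txt, glove.6B.100d.txt, glove.6B.200d.txt and glove.6B.300d.txt):
#
#   glove_path = download_and_extract_glove("./mind_demo")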
def generate_embeddings(
data_path,
news_words,
news_entities,
train_entities,
valid_entities,
max_sentence=10,
word_embedding_dim=100,
):
"""Generate embeddings.
Args:
data_path (str): Data path.
news_words (dict): News word dictionary.
news_entities (dict): News entity dictionary.
train_entities (str): Train entity file.
valid_entities (str): Validation entity file.
max_sentence (int): Max sentence size.
word_embedding_dim (int): Word embedding dimension.
Returns:
str, str, str: File paths to news, word and entity embeddings.
"""
embedding_dimensions = [50, 100, 200, 300]
if word_embedding_dim not in embedding_dimensions:
raise ValueError(
f"Wrong embedding dimension, available options are {embedding_dimensions}"
)
logger.info("Downloading glove...")
glove_path = download_and_extract_glove(data_path)
word_set = set()
word_embedding_dict = {}
entity_embedding_dict = {}
logger.info(f"Loading glove with embedding dimension {word_embedding_dim}...")
glove_file = "glove.6B." + str(word_embedding_dim) + "d.txt"
fp_pretrain_vec = open(os.path.join(glove_path, glove_file), "r", encoding="utf-8")
for line in fp_pretrain_vec:
linesplit = line.split(" ")
word_set.add(linesplit[0])
word_embedding_dict[linesplit[0]] = np.asarray(list(map(float, linesplit[1:])))
fp_pretrain_vec.close()
logger.info("Reading train entities...")
fp_entity_vec_train = open(train_entities, "r", encoding="utf-8")
for line in fp_entity_vec_train:
linesplit = line.split()
entity_embedding_dict[linesplit[0]] = np.asarray(
list(map(float, linesplit[1:]))
)
fp_entity_vec_train.close()
logger.info("Reading valid entities...")
fp_entity_vec_valid = open(valid_entities, "r", encoding="utf-8")
for line in fp_entity_vec_valid:
linesplit = line.split()
entity_embedding_dict[linesplit[0]] = np.asarray(
list(map(float, linesplit[1:]))
)
fp_entity_vec_valid.close()
logger.info("Generating word and entity indexes...")
    # For every news item, build two fixed-length (max_sentence) vectors: one of word
    # indexes into the vocabulary retained from GloVe, and one of entity indexes assigned
    # when a title token matches an entity surface form.
    word_dict = {}
word_index = 1
news_word_string_dict = {}
news_entity_string_dict = {}
entity2index = {}
entity_index = 1
for doc_id in news_words:
news_word_string_dict[doc_id] = [0 for n in range(max_sentence)]
news_entity_string_dict[doc_id] = [0 for n in range(max_sentence)]
surfaceform_entityids = news_entities[doc_id]
for item in surfaceform_entityids:
if item[1] not in entity2index and item[1] in entity_embedding_dict:
entity2index[item[1]] = entity_index
entity_index = entity_index + 1
for i in range(len(news_words[doc_id])):
if news_words[doc_id][i] in word_embedding_dict:
if news_words[doc_id][i] not in word_dict:
word_dict[news_words[doc_id][i]] = word_index
word_index = word_index + 1
news_word_string_dict[doc_id][i] = word_dict[news_words[doc_id][i]]
else:
news_word_string_dict[doc_id][i] = word_dict[news_words[doc_id][i]]
for item in surfaceform_entityids:
for surface in item[0]:
for surface_word in surface.split(" "):
if news_words[doc_id][i] == surface_word.lower():
if item[1] in entity_embedding_dict:
news_entity_string_dict[doc_id][i] = entity2index[
item[1]
]
if i == max_sentence - 1:
break
logger.info("Generating word embeddings...")
word_embeddings = np.zeros([word_index, word_embedding_dim])
for word in word_dict:
word_embeddings[word_dict[word]] = word_embedding_dict[word]
logger.info("Generating entity embeddings...")
entity_embeddings = np.zeros([entity_index, word_embedding_dim])
for entity in entity2index:
entity_embeddings[entity2index[entity]] = entity_embedding_dict[entity]
news_feature_path = os.path.join(data_path, "doc_feature.txt")
logger.info(f"Saving word and entity features in {news_feature_path}")
fp_doc_string = open(news_feature_path, "w", encoding="utf-8")
for doc_id in news_word_string_dict:
fp_doc_string.write(
doc_id
+ " "
+ ",".join(list(map(str, news_word_string_dict[doc_id])))
+ " "
+ ",".join(list(map(str, news_entity_string_dict[doc_id])))
+ "\n"
        )
    fp_doc_string.close()
word_embeddings_path = os.path.join(
data_path, "word_embeddings_5w_" + str(word_embedding_dim) + ".npy"
)
logger.info(f"Saving word embeddings in {word_embeddings_path}")
np.save(word_embeddings_path, word_embeddings)
entity_embeddings_path = os.path.join(
data_path, "entity_embeddings_5w_" + str(word_embedding_dim) + ".npy"
)
logger.info(f"Saving word embeddings in {entity_embeddings_path}")
np.save(entity_embeddings_path, entity_embeddings)
return news_feature_path, word_embeddings_path, entity_embeddings_path
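# Illustrative end-to-end sketch (hypothetical paths; "entity_embedding.vec" is the
# entity embedding file shipped inside each extracted MIND split):
#
#   news_words, news_entities = get_words_and_entities(
#       "./mind_demo/train/news.tsv", "./mind_demo/valid/news.tsv"
#   )
#   news_feature_file, word_emb_file, entity_emb_file = generate_embeddings(
#       "./mind_demo",
#       news_words,
#       news_entities,
#       "./mind_demo/train/entity_embedding.vec",
#       "./mind_demo/valid/entity_embedding.vec",
#       max_sentence=10,
#       word_embedding_dim=100,
#   )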
def load_glove_matrix(path_emb, word_dict, word_embedding_dim):
    """Load the pretrained embedding matrix for the words in word_dict.

    Args:
        path_emb (str): Folder path of the downloaded GloVe files.
        word_dict (dict): Word dictionary.
        word_embedding_dim (int): Dimension of the word embedding vectors.

    Returns:
        numpy.ndarray, list: Pretrained word embedding matrix, and the list of words found in the GloVe file.
    """
embedding_matrix = np.zeros((len(word_dict) + 1, word_embedding_dim))
exist_word = []
with open(os.path.join(path_emb, f"glove.6B.{word_embedding_dim}d.txt"), "rb") as f:
for l in tqdm(f): # noqa: E741 ambiguous variable name 'l'
l = l.split() # noqa: E741 ambiguous variable name 'l'
word = l[0].decode()
if len(word) != 0:
if word in word_dict:
wordvec = [float(x) for x in l[1:]]
index = word_dict[word]
embedding_matrix[index] = np.array(wordvec)
exist_word.append(word)
return embedding_matrix, exist_word
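# Illustrative usage (a sketch; word_dict is a hypothetical mapping from word to a
# 1-based row index, and glove_path is the folder returned by download_and_extract_glove):
#
#   glove_path = download_and_extract_glove("./mind_demo")
#   word_dict = {"news": 1, "sports": 2}
#   embedding_matrix, exist_word = load_glove_matrix(glove_path, word_dict, 100)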
def word_tokenize(sent):
    """Tokenize a sentence.

    Args:
        sent (str): The sentence to be tokenized.

    Returns:
        list: Words in the sentence.
    """
    # Treat runs of word characters or single punctuation marks as tokens.
    pat = re.compile(r"[\w]+|[.,!?;|]")
if isinstance(sent, str):
return pat.findall(sent.lower())
else:
return []
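# Illustrative usage (a sketch):
#
#   word_tokenize("Hello, world!")  # -> ["hello", ",", "world", "!"]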