Source code for recommenders.models.tfidf.tfidf_utils

# Copyright (c) Recommenders contributors.
# Licensed under the MIT License.


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from transformers import BertTokenizer
import re
import unicodedata
import pandas as pd
import numpy as np

import nltk
from nltk.stem.porter import PorterStemmer


class TfidfRecommender:
    """Term Frequency - Inverse Document Frequency (TF-IDF) Recommender

    This class provides content-based recommendations using TF-IDF
    vectorization in combination with cosine similarity.
    """

    def __init__(self, id_col, tokenization_method="scibert"):
        """Initialize model parameters

        Args:
            id_col (str): Name of column containing item IDs.
            tokenization_method (str): ['none','nltk','bert','scibert'] option for tokenization method.
        """
        self.id_col = id_col

        if tokenization_method.lower() not in ["none", "nltk", "bert", "scibert"]:
            raise ValueError(
                'Tokenization method must be one of ["none" | "nltk" | "bert" | "scibert"]'
            )
        self.tokenization_method = tokenization_method.lower()

        # Initialize other variables used in this class
        self.tf = TfidfVectorizer()
        self.tfidf_matrix = dict()
        self.tokens = dict()
        self.stop_words = frozenset()
        self.recommendations = dict()
        self.top_k_recommendations = pd.DataFrame()
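
    # Illustrative sketch (not part of the original module), assuming a
    # hypothetical item-ID column named "doc_id":
    #
    #   recommender = TfidfRecommender(id_col="doc_id", tokenization_method="nltk")
    #
    # "none" applies no extra tokenization, "nltk" adds Porter stemming, and
    # "bert"/"scibert" run a HuggingFace BertTokenizer before TF-IDF.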

    def __clean_text(self, text, for_BERT=False, verbose=False):
        """Clean text by removing HTML tags, symbols, and punctuation.

        Args:
            text (str): Text to clean.
            for_BERT (boolean): True or False for if this text is being cleaned for a BERT word tokenization method.
            verbose (boolean): True or False for whether to print.

        Returns:
            str: Cleaned version of text.
        """
        try:
            # Normalize unicode
            text_norm = unicodedata.normalize("NFC", text)

            # Remove HTML tags
            clean = re.sub("<.*?>", "", text_norm)

            # Remove new line and tabs
            clean = clean.replace("\n", " ")
            clean = clean.replace("\t", " ")
            clean = clean.replace("\r", " ")
            clean = clean.replace("\xa0", "")  # non-breaking space

            # Remove all punctuation and special characters
            clean = re.sub(
                r"([^\s\w]|_)+", "", clean
            )  # noqa W695 invalid escape sequence '\s'

            # If you want to keep some punctuation, see below commented out example
            # clean = re.sub(r'([^\s\w\-\_\(\)]|_)+','', clean)

            # Skip further processing if the text will be used in BERT tokenization
            if for_BERT is False:
                # Lower case
                clean = clean.lower()
        except Exception:
            if verbose is True:
                print("Cannot clean non-existent text")
            clean = ""

        return clean

    def clean_dataframe(self, df, cols_to_clean, new_col_name="cleaned_text"):
        """Clean the text within the columns of interest and return a dataframe with cleaned and combined text.

        Args:
            df (pandas.DataFrame): Dataframe containing the text content to clean.
            cols_to_clean (list of str): List of columns to clean by name (e.g., ['abstract','full_text']).
            new_col_name (str): Name of the new column that will contain the cleaned text.

        Returns:
            pandas.DataFrame: Dataframe with cleaned text in the new column.
        """
        # Collapse the table such that all descriptive text is just in a single column
        df = df.replace(np.nan, "", regex=True)
        df[new_col_name] = df[cols_to_clean].apply(lambda cols: " ".join(cols), axis=1)

        # Check if for BERT tokenization
        if self.tokenization_method in ["bert", "scibert"]:
            for_BERT = True
        else:
            for_BERT = False

        # Clean the text in the dataframe
        df[new_col_name] = df[new_col_name].map(
            lambda x: self.__clean_text(x, for_BERT)
        )

        return df
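
    # Illustrative sketch (not part of the original module): cleaning and
    # combining two hypothetical text columns into "cleaned_text".
    #
    #   df = pd.DataFrame(
    #       {
    #           "doc_id": ["a", "b"],
    #           "title": ["Deep learning", "Matrix <b>factorization</b>"],
    #           "abstract": ["Neural networks.", "Latent factor models."],
    #       }
    #   )
    #   df_clean = recommender.clean_dataframe(df, ["title", "abstract"])
    #   # df_clean["cleaned_text"][0] -> "deep learning neural networks"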

    def tokenize_text(
        self, df_clean, text_col="cleaned_text", ngram_range=(1, 3), min_df=0
    ):
        """Tokenize the input text.
        For more details on the TfidfVectorizer, see
        https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

        Args:
            df_clean (pandas.DataFrame): Dataframe with cleaned text in the new column.
            text_col (str): Name of column containing the cleaned text.
            ngram_range (tuple of int): The lower and upper boundary of the range of n-values for different n-grams to be extracted.
            min_df (int): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.

        Returns:
            TfidfVectorizer, pandas.Series:
            - Scikit-learn TfidfVectorizer object defined in `.tokenize_text()`.
            - Each row contains tokens for respective documents separated by spaces.
        """
        vectors = df_clean[text_col]

        # If a HuggingFace BERT word tokenization method
        if self.tokenization_method in ["bert", "scibert"]:
            # Set vectorizer
            tf = TfidfVectorizer(
                analyzer="word",
                ngram_range=ngram_range,
                min_df=min_df,
                stop_words="english",
            )

            # Get appropriate transformer name
            if self.tokenization_method == "bert":
                bert_method = "bert-base-cased"
            elif self.tokenization_method == "scibert":
                bert_method = "allenai/scibert_scivocab_cased"

            # Load pre-trained model tokenizer (vocabulary)
            tokenizer = BertTokenizer.from_pretrained(bert_method)

            # Loop through each item
            vectors_tokenized = vectors.copy()
            for i in range(0, len(vectors)):
                vectors_tokenized[i] = " ".join(tokenizer.tokenize(vectors[i]))

        elif self.tokenization_method == "nltk":
            # NLTK Stemming
            token_dict = {}  # noqa: F841
            stemmer = PorterStemmer()

            def stem_tokens(tokens, stemmer):
                stemmed = []
                for item in tokens:
                    stemmed.append(stemmer.stem(item))
                return stemmed

            def tokenize(text):
                tokens = nltk.word_tokenize(text)
                stems = stem_tokens(tokens, stemmer)
                return stems

            # When defining a custom tokenizer with TfidfVectorizer, the tokenization is applied in the fit function
            tf = TfidfVectorizer(
                tokenizer=tokenize,
                analyzer="word",
                ngram_range=ngram_range,
                min_df=min_df,
                stop_words="english",
            )
            vectors_tokenized = vectors

        elif self.tokenization_method == "none":
            # No tokenization applied
            tf = TfidfVectorizer(
                analyzer="word",
                ngram_range=ngram_range,
                min_df=min_df,
                stop_words="english",
            )
            vectors_tokenized = vectors

        # Save to class variable
        self.tf = tf

        return tf, vectors_tokenized
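
    # Illustrative sketch (not part of the original module):
    #
    #   tf, vectors_tokenized = recommender.tokenize_text(df_clean)
    #
    # For "bert"/"scibert", vectors_tokenized holds WordPiece tokens joined by
    # spaces; for "nltk", tokenization and Porter stemming are deferred to the
    # vectorizer's fit step; for "none", it is simply the cleaned text column.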

    def fit(self, tf, vectors_tokenized):
        """Fit TF-IDF vectorizer to the cleaned and tokenized text.

        Args:
            tf (TfidfVectorizer): sklearn.feature_extraction.text.TfidfVectorizer object defined in .tokenize_text().
            vectors_tokenized (pandas.Series): Each row contains tokens for respective documents separated by spaces.
        """
        self.tfidf_matrix = tf.fit_transform(vectors_tokenized)
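
    # Illustrative sketch (not part of the original module): fitting on the
    # output of tokenize_text() populates self.tfidf_matrix, a sparse
    # documents-by-terms matrix.
    #
    #   recommender.fit(tf, vectors_tokenized)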

    def get_tokens(self):
        """Return the tokens generated by the TF-IDF vectorizer.

        Returns:
            dict: Dictionary of tokens generated by the TF-IDF vectorizer.
        """
        try:
            self.tokens = self.tf.vocabulary_
        except Exception:
            self.tokens = "Run .tokenize_text() and .fit() first"
        return self.tokens

    def get_stop_words(self):
        """Return the stop words excluded in the TF-IDF vectorizer.

        Returns:
            frozenset: Frozenset of stop words used by the TF-IDF vectorizer (can be converted to a list).
        """
        try:
            self.stop_words = self.tf.get_stop_words()
        except Exception:
            self.stop_words = "Run .tokenize_text() and .fit() first"
        return self.stop_words
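
    # Illustrative sketch (not part of the original module), after fit():
    #
    #   vocabulary = recommender.get_tokens()      # {term: column index}
    #   stop_words = recommender.get_stop_words()  # frozenset of excluded terms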

    def __create_full_recommendation_dictionary(self, df_clean):
        """Create the full recommendation dictionary containing all recommendations for all items.

        Args:
            df_clean (pandas.DataFrame): Dataframe with cleaned text.
        """
        # Similarity measure
        cosine_sim = linear_kernel(self.tfidf_matrix, self.tfidf_matrix)

        # sorted_idx has the indices that would sort the array.
        sorted_idx = np.argsort(cosine_sim, axis=1)
        data = list(df_clean[self.id_col].values)
        len_df_clean = len(df_clean)
        results = {}
        for idx, row in zip(range(0, len_df_clean), data):
            similar_indices = sorted_idx[idx][: -(len_df_clean + 1) : -1]
            similar_items = [(cosine_sim[idx][i], data[i]) for i in similar_indices]
            results[row] = similar_items[1:]

        # Save to class
        self.recommendations = results

    def __organize_results_as_tabular(self, df_clean, k):
        """Restructures results dictionary into a table containing only the top k recommendations per item.

        Args:
            df_clean (pandas.DataFrame): Dataframe with cleaned text.
            k (int): Number of recommendations to return.
        """
        # Initialize new dataframe to hold recommendation output
        item_id = list()
        rec_rank = list()
        rec_score = list()
        rec_item_id = list()

        # For each item
        for _item_id in self.recommendations:
            # Information about the item we are basing recommendations off of
            rec_based_on = tmp_item_id = _item_id

            # Get all scores and IDs for items recommended for this current item
            rec_array = self.recommendations.get(rec_based_on)
            tmp_rec_score = list(map(lambda x: x[0], rec_array))
            tmp_rec_id = list(map(lambda x: x[1], rec_array))

            # Append multiple values at a time to list
            item_id.extend([tmp_item_id] * k)
            rec_rank.extend(list(range(1, k + 1)))
            rec_score.extend(tmp_rec_score[:k])
            rec_item_id.extend(tmp_rec_id[:k])

        # Save the output
        output_dict = {
            self.id_col: item_id,
            "rec_rank": rec_rank,
            "rec_score": rec_score,
            "rec_" + self.id_col: rec_item_id,
        }

        # Convert to dataframe
        self.top_k_recommendations = pd.DataFrame(output_dict)
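
    # Note (added for clarity, not in the original module): TfidfVectorizer
    # L2-normalizes each document vector by default (norm="l2"), so the plain
    # dot products computed by linear_kernel above are already cosine
    # similarities. A quick check, assuming a fitted TF-IDF matrix ``m``:
    #
    #   from sklearn.metrics.pairwise import cosine_similarity
    #   np.allclose(linear_kernel(m, m), cosine_similarity(m, m))  # -> True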

    def recommend_top_k_items(self, df_clean, k=5):
        """Recommend k number of items similar to the item of interest.

        Args:
            df_clean (pandas.DataFrame): Dataframe with cleaned text.
            k (int): Number of recommendations to return.

        Returns:
            pandas.DataFrame: Dataframe containing id of top k recommendations for all items.
        """
        if k > len(df_clean) - 1:
            raise ValueError(
                "Cannot get more recommendations than there are items. Set k lower."
            )
        self.__create_full_recommendation_dictionary(df_clean)
        self.__organize_results_as_tabular(df_clean, k)
        return self.top_k_recommendations
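
    # Illustrative sketch (not part of the original module): producing the
    # top-k table after fit(). Column names follow the id_col chosen at
    # construction time (here the hypothetical "doc_id"):
    #
    #   top_k = recommender.recommend_top_k_items(df_clean, k=5)
    #   # columns: doc_id, rec_rank, rec_score, rec_doc_id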

    def __get_single_item_info(self, metadata, rec_id):
        """Get full information for a single recommended item.

        Args:
            metadata (pandas.DataFrame): Dataframe containing item info.
            rec_id (str): Identifier for recommended item.

        Returns:
            pandas.Series: Single row from dataframe containing recommended item info.
        """
        # Return row
        rec_info = metadata.iloc[int(np.where(metadata[self.id_col] == rec_id)[0])]

        return rec_info

    def __make_clickable(self, address):
        """Make URL clickable.

        Args:
            address (str): URL address to make clickable.
        """
        return '<a href="{0}">{0}</a>'.format(address)

    def get_top_k_recommendations(
        self, metadata, query_id, cols_to_keep=[], verbose=True
    ):
        """Return the top k recommendations with useful metadata for each recommendation.

        Args:
            metadata (pandas.DataFrame): Dataframe holding metadata for all public domain papers.
            query_id (str): ID of item of interest.
            cols_to_keep (list of str): List of columns from the metadata dataframe to include
                (e.g., ['title','authors','journal','publish_time','url']). By default, all columns are kept.
            verbose (boolean): Set to True if you want to print the table.

        Returns:
            pandas.Styler: Stylized dataframe holding recommendations and associated metadata just for the item of interest (can access as normal dataframe by using df.data).
        """
        # Create subset of dataframe with just recommendations for the item of interest
        df = self.top_k_recommendations.loc[
            self.top_k_recommendations[self.id_col] == query_id
        ].reset_index()

        # Remove id_col of query item
        df.drop([self.id_col], axis=1, inplace=True)

        # Add metadata for each recommended item (rec_<id_col>)
        metadata_cols = metadata.columns.values
        df[metadata_cols] = df.apply(
            lambda row: self.__get_single_item_info(
                metadata, row["rec_" + self.id_col]
            ),
            axis=1,
        )

        # Remove id col added from metadata (already present from self.top_k_recommendations)
        df.drop([self.id_col], axis=1, inplace=True)

        # Rename columns such that rec_ is no longer appended, for simplicity
        df = df.rename(columns={"rec_rank": "rank", "rec_score": "similarity_score"})

        # Only keep columns of interest
        if len(cols_to_keep) > 0:
            # Insert our recommendation scoring/ranking columns
            # (work on a copy so the caller's list is not mutated)
            cols_to_keep = ["rank", "similarity_score"] + list(cols_to_keep)
            df = df[cols_to_keep]

        # Make URLs clickable if they exist
        if "url" in list(map(lambda x: x.lower(), metadata_cols)):
            format_ = {"url": self.__make_clickable}
            df = df.head().style.format(format_)

        if verbose:
            print(df)

        return df
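

# Illustrative end-to-end sketch (not part of the original module). It uses
# tokenization_method="none" so that no NLTK data or HuggingFace model
# downloads are needed; the toy dataframe and its column names ("doc_id",
# "title", "abstract") are hypothetical.
if __name__ == "__main__":
    toy = pd.DataFrame(
        {
            "doc_id": ["d1", "d2", "d3", "d4"],
            "title": [
                "Neural collaborative filtering",
                "Matrix factorization techniques",
                "Convolutional neural networks for images",
                "Gradient boosting for tabular data",
            ],
            "abstract": [
                "Recommendation with neural networks and implicit feedback.",
                "Latent factor models for recommender systems.",
                "Image classification with deep convolutional networks.",
                "Decision tree ensembles and boosting for prediction.",
            ],
        }
    )

    recommender = TfidfRecommender(id_col="doc_id", tokenization_method="none")
    clean = recommender.clean_dataframe(toy, ["title", "abstract"])
    tf, vectors_tokenized = recommender.tokenize_text(clean)
    recommender.fit(tf, vectors_tokenized)

    # Top 2 most similar items for every item in the toy corpus
    print(recommender.recommend_top_k_items(clean, k=2))

    # Metadata-enriched view for a single query item (printed when verbose=True)
    recommender.get_top_k_recommendations(toy, "d1", cols_to_keep=["title"])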