Source code for recommenders.models.tfidf.tfidf_utils

# Copyright (c) Recommenders contributors.
# Licensed under the MIT License.


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from transformers import BertTokenizer
import re
import unicodedata
import pandas as pd
import numpy as np

import nltk
from nltk.stem.porter import PorterStemmer


class TfidfRecommender:
    """Term Frequency - Inverse Document Frequency (TF-IDF) Recommender

    This class provides content-based recommendations using TF-IDF
    vectorization in combination with cosine similarity.
    """

    def __init__(self, id_col, tokenization_method="scibert"):
        """Initialize model parameters

        Args:
            id_col (str): Name of column containing item IDs.
            tokenization_method (str): ['none','nltk','bert','scibert'] option for tokenization method.
        """
        self.id_col = id_col

        if tokenization_method.lower() not in ["none", "nltk", "bert", "scibert"]:
            raise ValueError(
                'Tokenization method must be one of ["none" | "nltk" | "bert" | "scibert"]'
            )
        self.tokenization_method = tokenization_method.lower()

        # Initialize other variables used in this class
        self.tf = TfidfVectorizer()
        self.tfidf_matrix = dict()
        self.tokens = dict()
        self.stop_words = frozenset()
        self.recommendations = dict()
        self.top_k_recommendations = pd.DataFrame()
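
    # Illustrative sketch (not part of the original module), assuming a
    # hypothetical item-ID column named "doc_id":
    #
    #   recommender = TfidfRecommender(id_col="doc_id", tokenization_method="nltk")
    #
    # "none" applies no extra tokenization, "nltk" adds Porter stemming, and
    # "bert"/"scibert" run a HuggingFace BertTokenizer before TF-IDF.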

    def __clean_text(self, text, for_BERT=False, verbose=False):
        """Clean text by removing HTML tags, symbols, and punctuation.

        Args:
            text (str): Text to clean.
            for_BERT (boolean): True or False for if this text is being cleaned for a BERT word tokenization method.
            verbose (boolean): True or False for whether to print.

        Returns:
            str: Cleaned version of text.
        """
        try:
            # Normalize unicode
            text_norm = unicodedata.normalize("NFC", text)

            # Remove HTML tags
            clean = re.sub("<.*?>", "", text_norm)

            # Remove new line and tabs
            clean = clean.replace("\n", " ")
            clean = clean.replace("\t", " ")
            clean = clean.replace("\r", " ")
            clean = clean.replace("\xa0", "")  # non-breaking space

            # Remove all punctuation and special characters
            clean = re.sub(
                r"([^\s\w]|_)+", "", clean
            )  # noqa W695 invalid escape sequence '\s'

            # If you want to keep some punctuation, see below commented out example
            # clean = re.sub(r'([^\s\w\-\_\(\)]|_)+','', clean)

            # Skip further processing if the text will be used in BERT tokenization
            if for_BERT is False:
                # Lower case
                clean = clean.lower()
        except Exception:
            if verbose is True:
                print("Cannot clean non-existent text")
            clean = ""

        return clean

    def clean_dataframe(self, df, cols_to_clean, new_col_name="cleaned_text"):
        """Clean the text within the columns of interest and return a dataframe with cleaned and combined text.

        Args:
            df (pandas.DataFrame): Dataframe containing the text content to clean.
            cols_to_clean (list of str): List of columns to clean by name (e.g., ['abstract','full_text']).
            new_col_name (str): Name of the new column that will contain the cleaned text.

        Returns:
            pandas.DataFrame: Dataframe with cleaned text in the new column.
        """
        # Collapse the table such that all descriptive text is just in a single column
        df = df.replace(np.nan, "", regex=True)
        df[new_col_name] = df[cols_to_clean].apply(lambda cols: " ".join(cols), axis=1)

        # Check if for BERT tokenization
        if self.tokenization_method in ["bert", "scibert"]:
            for_BERT = True
        else:
            for_BERT = False

        # Clean the text in the dataframe
        df[new_col_name] = df[new_col_name].map(
            lambda x: self.__clean_text(x, for_BERT)
        )

        return df
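
    # Illustrative sketch (not part of the original module): cleaning and
    # combining two hypothetical text columns into "cleaned_text".
    #
    #   df = pd.DataFrame(
    #       {
    #           "doc_id": ["a", "b"],
    #           "title": ["Deep learning", "Matrix <b>factorization</b>"],
    #           "abstract": ["Neural networks.", "Latent factor models."],
    #       }
    #   )
    #   df_clean = recommender.clean_dataframe(df, ["title", "abstract"])
    #   # df_clean["cleaned_text"][0] -> "deep learning neural networks"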

    def tokenize_text(
        self, df_clean, text_col="cleaned_text", ngram_range=(1, 3), min_df=0
    ):
        """Tokenize the input text.
        For more details on the TfidfVectorizer, see
        https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

        Args:
            df_clean (pandas.DataFrame): Dataframe with cleaned text in the new column.
            text_col (str): Name of column containing the cleaned text.
            ngram_range (tuple of int): The lower and upper boundary of the range of n-values for different n-grams to be extracted.
            min_df (int): When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.

        Returns:
            TfidfVectorizer, pandas.Series:
            - Scikit-learn TfidfVectorizer object defined in `.tokenize_text()`.
            - Each row contains tokens for respective documents separated by spaces.
        """
        vectors = df_clean[text_col]

        # If a HuggingFace BERT word tokenization method
        if self.tokenization_method in ["bert", "scibert"]:
            # Set vectorizer
            tf = TfidfVectorizer(
                analyzer="word",
                ngram_range=ngram_range,
                min_df=min_df,
                stop_words="english",
            )

            # Get appropriate transformer name
            if self.tokenization_method == "bert":
                bert_method = "bert-base-cased"
            elif self.tokenization_method == "scibert":
                bert_method = "allenai/scibert_scivocab_cased"

            # Load pre-trained model tokenizer (vocabulary)
            tokenizer = BertTokenizer.from_pretrained(bert_method)

            # Loop through each item
            vectors_tokenized = vectors.copy()
            for i in range(0, len(vectors)):
                vectors_tokenized[i] = " ".join(tokenizer.tokenize(vectors[i]))

        elif self.tokenization_method == "nltk":
            # NLTK Stemming
            token_dict = {}  # noqa: F841
            stemmer = PorterStemmer()

            def stem_tokens(tokens, stemmer):
                stemmed = []
                for item in tokens:
                    stemmed.append(stemmer.stem(item))
                return stemmed

            def tokenize(text):
                tokens = nltk.word_tokenize(text)
                stems = stem_tokens(tokens, stemmer)
                return stems

            # When defining a custom tokenizer with TfidfVectorizer, the tokenization is applied in the fit function
            tf = TfidfVectorizer(
                tokenizer=tokenize,
                analyzer="word",
                ngram_range=ngram_range,
                min_df=min_df,
                stop_words="english",
            )
            vectors_tokenized = vectors

        elif self.tokenization_method == "none":
            # No tokenization applied
            tf = TfidfVectorizer(
                analyzer="word",
                ngram_range=ngram_range,
                min_df=min_df,
                stop_words="english",
            )
            vectors_tokenized = vectors

        # Save to class variable
        self.tf = tf

        return tf, vectors_tokenized
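
    # Illustrative sketch (not part of the original module):
    #
    #   tf, vectors_tokenized = recommender.tokenize_text(df_clean)
    #
    # For "bert"/"scibert", vectors_tokenized holds WordPiece tokens joined by
    # spaces; for "nltk", tokenization and Porter stemming are deferred to the
    # vectorizer's fit step; for "none", it is simply the cleaned text column.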

    def fit(self, tf, vectors_tokenized):
        """Fit TF-IDF vectorizer to the cleaned and tokenized text.

        Args:
            tf (TfidfVectorizer): sklearn.feature_extraction.text.TfidfVectorizer object defined in .tokenize_text().
            vectors_tokenized (pandas.Series): Each row contains tokens for respective documents separated by spaces.
        """
        self.tfidf_matrix = tf.fit_transform(vectors_tokenized)
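
    # Illustrative sketch (not part of the original module): fitting on the
    # output of tokenize_text() populates self.tfidf_matrix, a sparse
    # documents-by-terms matrix.
    #
    #   recommender.fit(tf, vectors_tokenized)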

    def get_tokens(self):
        """Return the tokens generated by the TF-IDF vectorizer.

        Returns:
            dict: Dictionary of tokens generated by the TF-IDF vectorizer.
        """
        try:
            self.tokens = self.tf.vocabulary_
        except Exception:
            self.tokens = "Run .tokenize_text() and .fit() first"
        return self.tokens

    def get_stop_words(self):
        """Return the stop words excluded in the TF-IDF vectorizer.

        Returns:
            frozenset: Frozenset of stop words used by the TF-IDF vectorizer (can be converted to a list).
        """
        try:
            self.stop_words = self.tf.get_stop_words()
        except Exception:
            self.stop_words = "Run .tokenize_text() and .fit() first"
        return self.stop_words
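
    # Illustrative sketch (not part of the original module), after fit():
    #
    #   vocabulary = recommender.get_tokens()      # {term: column index}
    #   stop_words = recommender.get_stop_words()  # frozenset of excluded terms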

    def __create_full_recommendation_dictionary(self, df_clean):
        """Create the full recommendation dictionary containing all recommendations for all items.

        Args:
            df_clean (pandas.DataFrame): Dataframe with cleaned text.
        """
        # Similarity measure
        cosine_sim = linear_kernel(self.tfidf_matrix, self.tfidf_matrix)

        # sorted_idx has the indices that would sort the array.
        sorted_idx = np.argsort(cosine_sim, axis=1)
        data = list(df_clean[self.id_col].values)
        len_df_clean = len(df_clean)
        results = {}
        for idx, row in zip(range(0, len_df_clean), data):
            similar_indices = sorted_idx[idx][: -(len_df_clean + 1) : -1]
            similar_items = [(cosine_sim[idx][i], data[i]) for i in similar_indices]
            results[row] = similar_items[1:]

        # Save to class
        self.recommendations = results

    def __organize_results_as_tabular(self, df_clean, k):
        """Restructures results dictionary into a table containing only the top k recommendations per item.

        Args:
            df_clean (pandas.DataFrame): Dataframe with cleaned text.
            k (int): Number of recommendations to return.
        """
        # Initialize new dataframe to hold recommendation output
        item_id = list()
        rec_rank = list()
        rec_score = list()
        rec_item_id = list()

        # For each item
        for _item_id in self.recommendations:
            # Information about the item we are basing recommendations off of
            rec_based_on = tmp_item_id = _item_id

            # Get all scores and IDs for items recommended for this current item
            rec_array = self.recommendations.get(rec_based_on)
            tmp_rec_score = list(map(lambda x: x[0], rec_array))
            tmp_rec_id = list(map(lambda x: x[1], rec_array))

            # Append multiple values at a time to list
            item_id.extend([tmp_item_id] * k)
            rec_rank.extend(list(range(1, k + 1)))
            rec_score.extend(tmp_rec_score[:k])
            rec_item_id.extend(tmp_rec_id[:k])

        # Save the output
        output_dict = {
            self.id_col: item_id,
            "rec_rank": rec_rank,
            "rec_score": rec_score,
            "rec_" + self.id_col: rec_item_id,
        }

        # Convert to dataframe
        self.top_k_recommendations = pd.DataFrame(output_dict)
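
    # Note (added for clarity, not in the original module): TfidfVectorizer
    # L2-normalizes each document vector by default (norm="l2"), so the plain
    # dot products computed by linear_kernel above are already cosine
    # similarities. A quick check, assuming a fitted TF-IDF matrix ``m``:
    #
    #   from sklearn.metrics.pairwise import cosine_similarity
    #   np.allclose(linear_kernel(m, m), cosine_similarity(m, m))  # -> True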

    def recommend_top_k_items(self, df_clean, k=5):
        """Recommend k number of items similar to the item of interest.

        Args:
            df_clean (pandas.DataFrame): Dataframe with cleaned text.
            k (int): Number of recommendations to return.

        Returns:
            pandas.DataFrame: Dataframe containing id of top k recommendations for all items.
        """
        if k > len(df_clean) - 1:
            raise ValueError(
                "Cannot get more recommendations than there are items. Set k lower."
            )
        self.__create_full_recommendation_dictionary(df_clean)
        self.__organize_results_as_tabular(df_clean, k)
        return self.top_k_recommendations
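
    # Illustrative sketch (not part of the original module): producing the
    # top-k table after fit(). Column names follow the id_col chosen at
    # construction time (here the hypothetical "doc_id"):
    #
    #   top_k = recommender.recommend_top_k_items(df_clean, k=5)
    #   # columns: doc_id, rec_rank, rec_score, rec_doc_id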

    def __get_single_item_info(self, metadata, rec_id):
        """Get full information for a single recommended item.

        Args:
            metadata (pandas.DataFrame): Dataframe containing item info.
            rec_id (str): Identifier for recommended item.

        Returns:
            pandas.Series: Single row from dataframe containing recommended item info.
        """
        # Return row
        rec_info = metadata.iloc[int(np.where(metadata[self.id_col] == rec_id)[0])]

        return rec_info

    def __make_clickable(self, address):
        """Make URL clickable.

        Args:
            address (str): URL address to make clickable.
        """
        return '<a href="{0}">{0}</a>'.format(address)

    def get_top_k_recommendations(
        self, metadata, query_id, cols_to_keep=[], verbose=True
    ):
        """Return the top k recommendations with useful metadata for each recommendation.

        Args:
            metadata (pandas.DataFrame): Dataframe holding metadata for all public domain papers.
            query_id (str): ID of item of interest.
            cols_to_keep (list of str): List of columns from the metadata dataframe to include
                (e.g., ['title','authors','journal','publish_time','url']). By default, all columns are kept.
            verbose (boolean): Set to True if you want to print the table.

        Returns:
            pandas.Styler: Stylized dataframe holding recommendations and associated metadata just for the item of interest (can access as normal dataframe by using df.data).
        """
        # Create subset of dataframe with just recommendations for the item of interest
        df = self.top_k_recommendations.loc[
            self.top_k_recommendations[self.id_col] == query_id
        ].reset_index()

        # Remove id_col of query item
        df.drop([self.id_col], axis=1, inplace=True)

        # Add metadata for each recommended item (rec_<id_col>)
        metadata_cols = metadata.columns.values
        df[metadata_cols] = df.apply(
            lambda row: self.__get_single_item_info(
                metadata, row["rec_" + self.id_col]
            ),
            axis=1,
        )

        # Remove id col added from metadata (already present from self.top_k_recommendations)
        df.drop([self.id_col], axis=1, inplace=True)

        # Rename columns such that rec_ is no longer appended, for simplicity
        df = df.rename(columns={"rec_rank": "rank", "rec_score": "similarity_score"})

        # Only keep columns of interest
        if len(cols_to_keep) > 0:
            # Insert our recommendation scoring/ranking columns
            # (work on a copy so the caller's list is not mutated)
            cols_to_keep = ["rank", "similarity_score"] + list(cols_to_keep)
            df = df[cols_to_keep]

        # Make URLs clickable if they exist
        if "url" in list(map(lambda x: x.lower(), metadata_cols)):
            format_ = {"url": self.__make_clickable}
            df = df.head().style.format(format_)

        if verbose:
            print(df)

        return df
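

# Illustrative end-to-end sketch (not part of the original module). It uses
# tokenization_method="none" so that no NLTK data or HuggingFace model
# downloads are needed; the toy dataframe and its column names ("doc_id",
# "title", "abstract") are hypothetical.
if __name__ == "__main__":
    toy = pd.DataFrame(
        {
            "doc_id": ["d1", "d2", "d3", "d4"],
            "title": [
                "Neural collaborative filtering",
                "Matrix factorization techniques",
                "Convolutional neural networks for images",
                "Gradient boosting for tabular data",
            ],
            "abstract": [
                "Recommendation with neural networks and implicit feedback.",
                "Latent factor models for recommender systems.",
                "Image classification with deep convolutional networks.",
                "Decision tree ensembles and boosting for prediction.",
            ],
        }
    )

    recommender = TfidfRecommender(id_col="doc_id", tokenization_method="none")
    clean = recommender.clean_dataframe(toy, ["title", "abstract"])
    tf, vectors_tokenized = recommender.tokenize_text(clean)
    recommender.fit(tf, vectors_tokenized)

    # Top 2 most similar items for every item in the toy corpus
    print(recommender.recommend_top_k_items(clean, k=2))

    # Metadata-enriched view for a single query item (printed when verbose=True)
    recommender.get_top_k_recommendations(toy, "d1", cols_to_keep=["title"])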