Source code for tprojection.core

##{
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import tprojection.utils as ut
##}

# Names exported by `from tprojection.core import *`.
__all__ = ["Tprojection"]

# Font settings handed to matplotlib's rc configuration below.
font = dict(size=12)

# Applied at import time through matplotlib.rc.
matplotlib.rc("font", **font)

##{
class Tprojection:
    """
    Study the relation between the target and a single feature, displaying a
    chart type adapted to the type of the input variables (categorical or
    continuous).

    Parameters
    ----------
    df: pandas DataFrame
        input data; a copy is stored, the caller's frame is not modified
    target: string
        name of the target column of df
    feature: string
        name of the feature column of df
    target_type: string
        can take the values "categorical" or "continuous"; when left empty
        the type is inferred with ut.is_continuous
    feature_type: string
        can take the values "categorical" or "continuous"; when left empty
        the type is inferred with ut.is_continuous
    target_modality: string
        modality of a categorical target encoded as 1 (every other modality
        is encoded as 0); when left empty the least frequent modality is used
    nb_buckets: int (0)
        if > 0, encode feature on nb_buckets dummy modalities if the
        cardinality is too high
    n_estimators: int (1)
        if > 1, use bootstrapping to evaluate estimator variance (only
        relevant for categorical features)
    continuous_threshold: float (0.05)
        threshold forwarded to ut.is_continuous when a type has to be inferred

    Attributes
    ----------
    df: pandas DataFrame
        working copy of the input, with an extra "target_san" column (and a
        "<feature>_encoded" column once a categorical feature was plotted)
    fig, ax1, ax2:
        matplotlib figure, main axes and twin axes created by plot()
    dg: pandas DataFrame
        per-modality statistics (mean, min, max, count, baseline) computed
        for categorical features
    """

    def __init__(self, df, target, feature,
                 target_type="", feature_type="",
                 target_modality="",
                 nb_buckets=0, n_estimators=1,
                 continuous_threshold=0.05):

        self.df = df.copy()
        self.target = target
        self.feature = feature
        self.target_type = target_type
        self.feature_type = feature_type
        self.target_modality = target_modality
        self.nb_buckets = nb_buckets
        self.continuous_threshold = continuous_threshold
        self.n_estimators = n_estimators

        # order matters: dtypes depend on the types, the modality on the
        # dtypes, and the sanitized target on the modality
        self._infer_type()
        self._check_dtype_consistency()
        self._infer_target_modality()
        self._sanitize_target()

    def plot(self):
        """Create the figure and draw the chart matching the variable types."""
        self.fig, self.ax1 = plt.subplots()
        self.ax2 = self.ax1.twinx()

        if self.feature_type == "categorical":
            self._cat2all_prep()
            # boxplots of the raw target only make sense for a continuous target
            self._cat2all_plot(show_boxplot=self.target_type != "categorical")
        elif self.target_type == "categorical":
            self._con2bin_plot()
        else:
            self._con2con_plot()

        plt.tight_layout()
        plt.show(block=False)

    def _infer_type(self):
        """Fill in the target/feature types the caller left empty."""
        for var in ("target", "feature"):
            type_attr = var + "_type"
            if getattr(self, type_attr) == "":
                column = self.df[getattr(self, var)]
                continuous = ut.is_continuous(column, self.continuous_threshold)
                # stored as a plain str (not a 0-d numpy array)
                setattr(self, type_attr, "continuous" if continuous else "categorical")

    def _check_dtype_consistency(self):
        """Cast categorical columns to str and the other ones to float."""
        for var in ("feature", "target"):
            column = getattr(self, var)
            dtype = str if getattr(self, var + "_type") == "categorical" else float
            self.df[column] = self.df[column].astype(dtype)

    def _infer_target_modality(self):
        """Default the target modality to the least frequent class."""
        if self.target_type == "categorical" and self.target_modality == "":
            self.target_modality = self.df[self.target].value_counts().sort_values().index[0]

    def _sanitize_target(self):
        """Build the numeric "target_san" column used by every estimator."""
        if self.target_type == "categorical":
            # the target column was cast to str in _check_dtype_consistency, so
            # the modality must be compared as a str too (e.g. 1 vs "1")
            is_modality = self.df[self.target] == str(self.target_modality)
            self.df["target_san"] = is_modality.astype(int)
        else:
            self.df["target_san"] = self.df[self.target]

    def _bootstrap(self, feature):
        """
        Estimate the mean of "target_san" for each modality of `feature`.

        The frame is resampled n_estimators times (with replacement only when
        n_estimators > 1) and the per-modality means are aggregated.

        Returns
        -------
        pandas DataFrame indexed by modality with the columns "mean", "min",
        "max" (over the resamples) and "count" (non-null targets per modality
        in the full frame).
        """
        n_estimators = max(int(self.n_estimators), 1)
        replace = n_estimators > 1
        counts = self.df.groupby(feature)[self.target].count()

        estimates = pd.concat(
            [
                self.df.sample(frac=1, replace=replace)
                .groupby(feature)["target_san"].mean()
                for _ in range(n_estimators)
            ],
            axis=1,
            keys=list(range(n_estimators)),
        )

        dboot = pd.DataFrame({
            "mean": estimates.mean(axis=1),
            "min": estimates.min(axis=1),
            "max": estimates.max(axis=1),
        })
        # align by label: a resample may miss a rare modality, so positional
        # assignment of the counts would be unsafe
        dboot["count"] = counts.reindex(dboot.index).astype(float)
        return dboot

    def _cat2all_prep(self):
        """Encode the feature and compute the per-modality statistics (self.dg)."""
        segment = self.feature + "_encoded"

        if self.nb_buckets:
            self.encoding = ut.get_encoding(self.df, 'target_san', self.feature, self.nb_buckets)
        else:
            # identity encoding: every modality is kept as is
            self.encoding = {v: v for v in self.df[self.feature].unique()}
        self.df[segment] = self.df[self.feature].map(self.encoding)

        dg = self._bootstrap(segment)
        dg['baseline'] = self.df["target_san"].mean()
        # most populated modalities first
        dg = dg.sort_values(by="count", ascending=False)

        self.dg = dg
        self.segment = segment

    def _cat2all_plot(self, show_boxplot=False):
        """
        Plot the modality counts as bars (left axis) and the target estimate
        with its min/max band and the baseline (right axis).

        Parameters
        ----------
        show_boxplot: bool (False)
            if truthy, overlay a boxplot of the raw target for each modality
        """
        # the bars are drawn at 0..n-1; every overlay uses the same positions
        positions = np.arange(len(self.dg))

        self.dg["count"].plot(kind="bar", color="blue", ax=self.ax1, alpha=0.5)
        self.ax1.set_xticklabels(self.ax1.get_xticklabels(), rotation=45)

        self.ax2.plot(positions, self.dg["mean"].values, color="red", marker="o",
                      markersize=5, linewidth=2)
        self.ax2.plot(positions, self.dg["min"].values, color="red", linewidth=1, linestyle="--")
        self.ax2.plot(positions, self.dg["max"].values, color="red", linewidth=1, linestyle="--")
        self.ax2.fill_between(positions, self.dg["min"].values, self.dg["max"].values,
                              facecolor="red", alpha=0.2)

        self.ax2.plot(positions, self.dg["baseline"].values, color="black",
                      linestyle="--", linewidth=2)

        if show_boxplot:
            # drawn on the target-scaled axis explicitly instead of relying on
            # whatever the current pyplot axes happens to be
            sns.boxplot(x=self.segment, y=self.target, data=self.df, order=self.dg.index,
                        color="white", boxprops=dict(alpha=0.5), ax=self.ax2)

        self.ax1.set_xlim([-0.6, len(self.dg) - 0.5])
        self.ax1.set_xlabel(self.feature)
        self.ax2.set_ylabel(self.target)
        self.ax1.set_ylabel("count")

    def _con2bin_plot(self):
        """
        plot two histograms, one for each class of the target if the target is
        binary and the feature continuous
        """
        values = self.df[self.feature]
        pos = values[self.df["target_san"] == 1]
        neg = values[self.df["target_san"] == 0]

        # target_san only holds 0/1 here, so the range of the whole column is
        # the range over both classes (and stays defined if a class is empty)
        lb, ub = values.min(), values.max()
        # at least two edges are needed to define a bin
        nb_edges = max(int(np.round(len(pos) ** 0.5)), 2)
        bins = np.linspace(lb, ub, nb_edges)

        # normalised histograms on shared bins (replaces deprecated sns.distplot)
        hist_kws = dict(bins=bins, density=True, alpha=0.4)
        self.ax1.hist(neg.dropna(), **hist_kws)
        self.ax1.hist(pos.dropna(), **hist_kws)

        self.ax1.legend(["neg. ({})".format(len(neg)), "pos. ({})".format(len(pos))])
        self.ax1.set_xlabel(self.feature)
        self.ax1.set_ylabel("density")
        self.ax2.set_visible(False)

    def _con2con_plot(self):
        """ display a simple scatter plot if both target and feature are continuous
        """
        corrcoef = np.round(self.df[[self.feature, self.target]].corr().values[0, 1], 3)
        # regplot draws the points itself; a separate scatter call would plot
        # every point twice and defeat the transparency
        sns.regplot(x=self.feature, y=self.target, data=self.df,
                    ax=self.ax1, scatter_kws={'alpha': 0.5},
                    line_kws={"color": "black", "linestyle": "--"})
        self.ax1.text(0.95, 0.95, f"corrcoef: {corrcoef}", horizontalalignment='right',
                      verticalalignment='top', transform=self.ax1.transAxes)
        self.ax1.set_xlabel(self.feature)
        self.ax1.set_ylabel(self.target)
        self.ax2.set_visible(False)

##}