##{
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import tprojection.utils as ut
##}
# Names exported by ``from <module> import *``.
__all__ = ["Tprojection"]

# Global matplotlib font settings, applied as a side effect of importing
# this module (they affect every figure drawn afterwards).
font = dict(size=12)
matplotlib.rc("font", **font)
##{
class Tprojection:
    """Study the relation between a target and a single feature.

    The chart drawn by :meth:`plot` is adapted to the type (categorical or
    continuous) of the two variables.

    Parameters
    ----------
    df: pandas DataFrame
        input data; it is copied, so the caller's frame is never modified
    target: string
        name of the target column of ``df``
    feature: string
        name of the feature column of ``df``
    target_type: string
        can take the values "categorical" or "continuous"; when left empty
        it is inferred with ``ut.is_continuous``
    feature_type: string
        can take the values "categorical" or "continuous"; when left empty
        it is inferred with ``ut.is_continuous``
    target_modality: string
        modality of a categorical target that is encoded as 1 (every other
        modality is encoded as 0); when left empty the least frequent
        modality is used
    nb_buckets: int (0)
        if > 0, the feature is re-encoded with ``ut.get_encoding`` before
        plotting (presumably to limit a too high cardinality to
        ``nb_buckets`` modalities -- confirm against ``ut.get_encoding``)
    n_estimators: int (1)
        if > 1, use bootstrapping to evaluate the estimator variance (only
        used when the feature is categorical)
    continuous_threshold: float (0.05)
        threshold forwarded to ``ut.is_continuous`` when a type has to be
        inferred
    """

    def __init__(self, df, target, feature,
                 target_type="", feature_type="",
                 target_modality="",
                 nb_buckets=0, n_estimators=1,
                 continuous_threshold=0.05):
        self.df = df.copy()
        self.target = target
        self.feature = feature
        self.target_type = target_type
        self.feature_type = feature_type
        self.target_modality = target_modality
        self.nb_buckets = nb_buckets
        self.continuous_threshold = continuous_threshold
        self.n_estimators = n_estimators

        # Order matters: types drive the dtype casts, which in turn drive
        # the modality inference and the sanitized target column.
        self._infer_type()
        self._check_dtype_consistency()
        self._infer_target_modality()
        self._sanitize_target()

    def plot(self):
        """Create a figure and draw the chart matching the variable types.

        The figure and its two axes (sharing the x axis) are stored in
        ``self.fig``, ``self.ax1`` and ``self.ax2``.
        """
        self.fig, self.ax1 = plt.subplots()
        self.ax2 = self.ax1.twinx()

        if self.feature_type == "categorical":
            self._cat2all_prep()
            # The boxplot is only drawn for a non-categorical target.
            self._cat2all_plot(show_boxplot=self.target_type != "categorical")
        elif self.target_type == "categorical":
            self._con2bin_plot()
        else:
            self._con2con_plot()

        plt.tight_layout()
        plt.show(block=False)

    def _infer_type(self):
        """Fill in the target / feature types that were left empty.

        The types are stored as plain strings (not 0-d numpy arrays) so
        that later comparisons and boolean tests behave as expected.
        """
        if self.target_type == "":
            is_continuous = ut.is_continuous(self.df[self.target], self.continuous_threshold)
            self.target_type = "continuous" if bool(is_continuous) else "categorical"
        if self.feature_type == "":
            is_continuous = ut.is_continuous(self.df[self.feature], self.continuous_threshold)
            self.feature_type = "continuous" if bool(is_continuous) else "categorical"

    def _check_dtype_consistency(self):
        """Cast feature and target to ``str`` (categorical) or ``float`` (otherwise)."""
        for column, kind in ((self.feature, self.feature_type),
                             (self.target, self.target_type)):
            dtype = str if kind == "categorical" else float
            self.df[column] = self.df[column].astype(dtype)

    def _infer_target_modality(self):
        """Default the target modality to the least frequent one (categorical target only)."""
        if self.target_type == "categorical" and self.target_modality == "":
            self.target_modality = self.df[self.target].value_counts().sort_values().index[0]

    def _sanitize_target(self):
        """Create the ``target_san`` column used by every estimate.

        For a categorical target it is 1 where the target equals
        ``target_modality`` and 0 elsewhere; otherwise it is the target
        itself.
        """
        if self.target_type == "categorical":
            self.df["target_san"] = np.where(self.df[self.target] == self.target_modality, 1, 0)
        else:
            self.df["target_san"] = self.df[self.target]

    def _bootstrap(self, feature):
        """Estimate the mean of ``target_san`` per modality of ``feature``.

        Parameters
        ----------
        feature: string
            name of the column of ``self.df`` to group by

        Returns
        -------
        pandas DataFrame
            one row per modality, with the columns "mean", "min", "max"
            (aggregated over the estimators) and "count" (number of
            non-null target values in the original data)
        """
        # Values below 1 are treated as a single, non-resampled estimate.
        n_rounds = max(int(self.n_estimators), 1)
        # Sampling with replacement only makes sense with several estimators;
        # with a single one the sample is a permutation of the full data.
        with_replacement = n_rounds > 1
        counts = self.df.groupby(feature)[self.target].count()

        estimates = [
            self.df.sample(frac=1, replace=with_replacement)
            .groupby(feature)["target_san"]
            .mean()
            for _ in range(n_rounds)
        ]
        # One row per estimator, one column per modality.
        stacked = pd.concat(estimates, axis=1, keys=range(n_rounds)).T
        # A resample may miss some modalities: align the columns on the
        # modalities of the full data so the counts below match positionally.
        stacked = stacked.reindex(columns=counts.index)

        summary = stacked.aggregate(["mean", "min", "max"], axis=0)
        summary.loc["count", :] = counts.values
        return summary.T

    def _cat2all_prep(self):
        """Prepare the per-modality statistics plotted by ``_cat2all_plot``.

        Sets ``self.encoding``, ``self.segment`` (name of the encoded
        feature column) and ``self.dg`` (statistics sorted by decreasing
        count, with a constant "baseline" column holding the overall mean
        of ``target_san``).
        """
        if self.nb_buckets:
            self.encoding = ut.get_encoding(self.df, "target_san", self.feature, self.nb_buckets)
        else:
            # Identity encoding: every modality is kept as is.
            self.encoding = {v: v for v in self.df[self.feature].unique()}

        segment = self.feature + "_encoded"
        self.df[segment] = self.df[self.feature].map(self.encoding)

        dg = self._bootstrap(segment)
        dg["baseline"] = self.df["target_san"].mean()
        dg.sort_values(by="count", ascending=False, inplace=True)

        self.dg = dg
        self.segment = segment

    def _cat2all_plot(self, show_boxplot=False):
        """Plot counts (bars, left axis) and target estimates (lines, right axis).

        Parameters
        ----------
        show_boxplot: bool (False)
            if true, overlay a boxplot of the target per modality
        """
        self.dg["count"].plot(kind="bar", color="blue", ax=self.ax1, alpha=0.5)
        xlabels = self.ax1.get_xticklabels()
        self.ax1.set_xticklabels(xlabels, rotation=45)

        self.dg["mean"].plot(color="red", marker="o", markersize=5, linewidth=2, ax=self.ax2)
        self.dg["min"].plot(color="red", linewidth=1, linestyle="--", ax=self.ax2)
        self.dg["max"].plot(color="red", linewidth=1, linestyle="--", ax=self.ax2)
        # Bars and lines are drawn at positions 0..n-1; use them directly
        # instead of relying on the current tick locations.
        positions = np.arange(len(self.dg))
        self.ax2.fill_between(positions, self.dg["min"], self.dg["max"], facecolor="red", alpha=0.2)
        self.dg["baseline"].plot(color="black", linestyle="--", linewidth=2, ax=self.ax2)

        if show_boxplot:
            # Draw explicitly on the right axis rather than on whatever
            # axes happens to be current.
            sns.boxplot(x=self.segment, y=self.target, data=self.df, order=self.dg.index,
                        color="white", boxprops=dict(alpha=0.5), ax=self.ax2)

        self.ax1.set_xlim([-0.6, len(self.dg) - 0.5])
        self.ax1.set_xlabel(self.feature)
        self.ax2.set_ylabel(self.target)
        self.ax1.set_ylabel("count")

    def _con2bin_plot(self):
        """
        plot two histograms, one for each class of the target if the target is
        binary and the feature continuous
        """
        values = self.df[self.feature]
        pos = values[self.df["target_san"] == 1]
        neg = values[self.df["target_san"] == 0]

        # Square-root rule on the positive class; at least two edges are
        # needed to define one bin.
        n_edges = max(int(np.round(len(pos) ** 0.5)), 2)
        # target_san is 0 or 1 on every row here, so the overall range is
        # the range over both classes (and stays finite if one is empty).
        bins = np.linspace(values.min(), values.max(), n_edges)

        # Plain matplotlib histograms replace the deprecated sns.distplot
        # (kde=False, norm_hist=True), keeping its default alpha of 0.4.
        self.ax1.hist(neg.dropna(), bins=bins, density=True, alpha=0.4,
                      label="neg. ({})".format(len(neg)))
        self.ax1.hist(pos.dropna(), bins=bins, density=True, alpha=0.4,
                      label="pos. ({})".format(len(pos)))
        self.ax1.legend()
        self.ax1.set_xlabel(self.feature)
        self.ax1.set_ylabel("density")
        self.ax2.set_visible(False)

    def _con2con_plot(self):
        """ display a simple scatter plot if both target and feature are continuous
        """
        corrcoef = np.round(self.df[[self.feature, self.target]].corr().values[0, 1], 3)
        # regplot draws both the points and the regression line, so no
        # separate scatter call is needed.
        sns.regplot(x=self.feature, y=self.target, data=self.df,
                    ax=self.ax1, scatter_kws={'alpha': 0.5},
                    line_kws={"color": "black", "linestyle": "--"})
        self.ax1.text(0.95, 0.95, f"corrcoef: {corrcoef}", horizontalalignment='right',
                      verticalalignment='top', transform=self.ax1.transAxes)
        self.ax1.set_xlabel(self.feature)
        self.ax1.set_ylabel(self.target)
        self.ax2.set_visible(False)
##}