Source code for tprojection.utils

import pandas as pd
import numpy as np

##{
def is_continuous(s, thresh):
    """
    Return True if the series is considered continuous.

    A series is considered continuous when it can be converted to numeric
    values and the ratio of distinct values to non-null values is at least
    ``thresh``.

    Parameters
    ----------
    s : pandas Series
        Values to inspect.
    thresh : float
        Minimum ``nunique / count`` ratio for the series to be considered
        continuous.

    Returns
    -------
    bool
        False when the series cannot be converted to numeric, when it holds
        no non-null value, or when its distinct-value ratio is below
        ``thresh``; True otherwise.
    """
    try:
        pd.to_numeric(s)
    except Exception:
        # Any conversion failure means the series is not numeric. Catching
        # Exception (instead of a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return False

    nb_values = s.count()
    if nb_values == 0:
        # Empty or all-missing series: the ratio would be 0/0, so there is
        # nothing to support calling it continuous.
        return False

    # Written as "not (ratio < thresh)" to keep the original comparison
    # semantics, and to return a plain Python bool.
    return not (s.nunique() / nb_values < thresh)
##} ##{
def get_encoding(df, target, feature, nb_buckets):
    """
    Encode the feature modalities on a limited number of buckets.

    Modalities whose frequency is strictly greater than
    ``1 / (2 * nb_buckets)`` are kept as they are (mapped to themselves).
    The remaining low-frequency modalities are regrouped, in the order
    produced by ``df.groupby(feature)``, into buckets labelled ``"g1"``,
    ``"g2"``, ... according to their cumulative frequency.

    Parameters
    ----------
    df : pandas DataFrame
        Data holding both the ``feature`` and ``target`` columns.
    target : str
        Name of the target column; its per-modality count is used as the
        modality frequency.
    feature : str
        Name of the column whose modalities are encoded.
    nb_buckets : int
        Requested number of encoded modalities.

    Returns
    -------
    dict
        Mapping from each modality to its encoded value: the modality
        itself for high-frequency modalities, a ``"g<k>"`` label otherwise.

    Raises
    ------
    AssertionError
        If ``nb_buckets`` is not lower than the number of unique values of
        ``feature``, or if ``feature`` contains missing values.

    Notes
    -----
    Because the high-frequency threshold is ``1 / (2 * nb_buckets)``, there
    can be ``nb_buckets`` or more high-frequency modalities. In that case
    all low-frequency modalities are gathered in a single ``"g1"`` bucket,
    so the result can hold more than ``nb_buckets`` distinct codes.
    """
    assert nb_buckets < len(df[feature].unique()) , "the number of encoded modalities shall be lower than the number of unique element in {}".format(feature)
    assert df[feature].isna().sum() == 0, "feature column shall not contain missing value"

    dg = df.groupby(feature).agg({target: ["count", "mean"]})
    dg.columns = ["count", "mean"]
    total = dg["count"].sum()
    # The cumulative ratio is computed over all modalities, including the
    # high-frequency ones.
    dg["cumratio"] = dg["count"].cumsum() / total
    dg["ratio"] = dg["count"] / total

    # isolate modalities with high frequency: they keep their own value
    thresh_freq = 1 / nb_buckets / 2
    is_high_freq = dg["ratio"] > thresh_freq
    high_freq_modalities = list(dg[is_high_freq].index)
    high_freq_map = {moda: moda for moda in high_freq_modalities}

    # regroup low frequency modalities
    low_freq_cumratio = dg.loc[dg["ratio"] <= thresh_freq, "cumratio"]
    # Always keep at least one bucket for the low-frequency modalities:
    # with nb_buckets or more high-frequency modalities the boundaries
    # would otherwise be empty (IndexError) or negative-sized (ValueError).
    nb_low_buckets = max(nb_buckets - len(high_freq_modalities), 1)
    boundaries = np.linspace(0, 1, nb_low_buckets + 1)

    groups = {"g" + str(k + 1): [] for k in range(max(nb_buckets, nb_low_buckets))}
    bucket = 1
    for moda, cumratio in low_freq_cumratio.items():
        # Move to the next bucket once the cumulative frequency crosses the
        # upper boundary of the current one (at most one step per modality).
        if cumratio > boundaries[bucket]:
            bucket += 1
        groups["g" + str(bucket)].append(moda)

    low_freq_map = {moda: label for label, modas in groups.items() for moda in modas}
    encoding = dict(high_freq_map)
    encoding.update(low_freq_map)
    return encoding
##}