Source code for tprojection.utils

import pandas as pd
import numpy as np

##{
[docs]def is_continuous(s, thresh):
    """
    Return true if the series is continuous

    Parameters
    ----------------

    s : pandas Series
    thresh : float

    Returns
    ----------------

    Boolean

    """
    try:
       _ = pd.to_numeric(s)
    except:
        return False
    if s.nunique()/s.count() < thresh:
        return False
    else:
        return True

    ##}

##{     
[docs]def get_encoding(df, target, feature, nb_buckets):
    """
    Encode the feature modalities on a maximum of nb_buckets 

    Parameters
    ----------------

    df : pandas DataFrame
    target : str
    feature : str
    nb_buckets : int

    Returns
    ----------------

    Dict()
    """

    assert nb_buckets < len(df[feature].unique()) , "the number of encoded modalities shall be lower than the number of unique element in {}".format(feature)
    assert df[feature].isna().sum() == 0, "feature column shall not contain missing value"
    dg = df.groupby(feature).agg({target: ["count", "mean"]})
    dg.columns = ["count", "mean"]
    dg["cumratio"] = dg["count"].cumsum()/dg["count"].sum()
    dg["ratio"] = dg["count"]/dg["count"].sum()

    # isolate modality with high frequency
    thresh_freq = 1/nb_buckets/2
    high_freq_modalities = list(dg[dg.ratio > thresh_freq].index)
    nb_high_freq_modalities = len(high_freq_modalities)
    high_freq_map = {v: v for v in high_freq_modalities}

    # regroup low frequency modalities
    low_freq_dg =  dg[dg.ratio <= thresh_freq]
    slicer = np.linspace(0, 1, nb_buckets + 1 - nb_high_freq_modalities)
    ii = 1
    mymap = {"g" + str(ii+1): [] for ii in range(nb_buckets)}
    for row in low_freq_dg.iterrows():
        if row[1]["cumratio"] > slicer[ii]:
            ii+=1
        mymap["g" + str(ii)].append(row[0])
    low_freq_map = {moda: k for k, vals in mymap.items() for moda in vals}

    return dict(high_freq_map, **low_freq_map)
##}