Source code for wiggum.trend_components.categorical

import pandas as pd
import numpy as np
import itertools
import scipy.stats as stats

class StatBinRankTrend():
    """
    Compute a trend that determines whether the alphabetically ordered values
    of a two-valued categorical variable are > or < when ordered by a
    statistic of another variable. Quality is based on the ratio and the
    distance is 0/1 loss.
    """
    # NOTE(review): presumably selects the legend style used when this trend
    # is displayed in an overview -- confirm against the consumer
    overview_legend = 'binary'

    def is_computable(self, labeled_df=None):
        """
        check if this trend can be computed based on data and metadata
        available

        Parameters
        ----------
        self : Trend
            a trend object with a set_vars attribute
        labeled_df : LabeledDataFrame {None} (optional)
            data to use if trend is not already configured

        Returns
        -------
        computable : bool
            True if the statistic, trend grouping variables and targets are
            all set and there is exactly one weight entry per target

        See also: get_trends() for a description of how this trend is
        computed
        """
        if not self.set_vars:
            self.get_trend_vars(labeled_df)

        # evaluate lazily so the length comparison is only reached once
        # target is known to be non-empty (len(None) would raise), and use
        # plain boolean logic instead of np.product, which was removed in
        # NumPy 2.0
        return bool(self.my_stat
                    and self.trendgroup
                    and self.target
                    and len(self.var_weight_list) == len(self.target))

    def get_distance(self, row, col_a='subgroup_trend', col_b='agg_trend'):
        """
        0/1 loss on ><

        Parameters
        ----------
        row : pd.Series
            row of a result_df DataFrame
        col_a : string {'subgroup_trend'}
            name of the first column to compare
        col_b : string {'agg_trend'}
            name of the second column to compare

        Returns
        -------
        0_1_loss : int
            0 if the values in col_a and col_b are equal, 1 otherwise;
            compatible with assignment to a cell of a result_df
        """
        # if they're the same, int(True) = 1, but dist = 0
        # if they're not, int(False) = 0, but dist = 1
        return 1 - int(row[col_b] == row[col_a])

    def is_SP(self, row, thresh=0, col_a='subgroup_trend', col_b='agg_trend'):
        """
        check if the two trends in a row disagree

        Parameters
        ----------
        row : pd.Series
            row of a result_df DataFrame
        thresh : float {0}
            not used by this trend type
            NOTE(review): presumably kept so the signature matches other
            trend types -- confirm against callers
        col_a : string {'subgroup_trend'}
            name of the first column to compare
        col_b : string {'agg_trend'}
            name of the second column to compare

        Returns
        -------
        is_sp : bool
            True if the values in col_a and col_b differ
        """
        return not (row[col_a] == row[col_b])
class StatRankTrend():
    """
    Compute a trend that is the ascending ranking of categorical variables.
    Quality is based on the trend vs actual kendall tau distance and the
    distance between subgroup and aggregate is 1-tau (rescaled to [0,1]);
    the distances are a continuous value.
    """
    # NOTE(review): presumably selects the legend style used when this trend
    # is displayed in an overview -- confirm against the consumer
    overview_legend = 'continuous'

    def is_computable(self, labeled_df=None):
        """
        check if this trend can be computed based on data and metadata
        available

        Parameters
        ----------
        self : Trend
            a trend object with a set_vars attribute
        labeled_df : LabeledDataFrame {None} (optional)
            data to use if trend is not already configured

        Returns
        -------
        computable : bool
            True if the statistic, trend grouping variables and targets are
            all set and there is exactly one weight entry per target

        See also: get_trends() for a description of how this trend is
        computed
        """
        if not self.set_vars:
            self.get_trend_vars(labeled_df)

        # evaluate lazily so the length comparison is only reached once
        # target is known to be non-empty (len(None) would raise), and use
        # plain boolean logic instead of np.product, which was removed in
        # NumPy 2.0
        return bool(self.my_stat
                    and self.trendgroup
                    and self.target
                    and len(self.var_weight_list) == len(self.target))

    def get_distance(self, row, col_a='subgroup_trend', col_b='agg_trend'):
        """
        kendall tau distance as a permutation distance

        Parameters
        ----------
        row : pd.Series
            row of a result_df DataFrame. the `agg_trend` and
            `subgroup_trend` columns must contain lists of hashable values
        col_a : string {'subgroup_trend'}
            name of the column holding the first ranking
        col_b : string {'agg_trend'}
            name of the column holding the second ranking

        Returns
        -------
        tau_dist : float
            permutation distance between the rankings in col_a and col_b,
            0 for identical order and 1 for fully reversed order, rounded
            to 4 places; compatible with assignment to a cell of a
            result_df. The value is whatever scipy yields for undefined tau
            (e.g. NaN) passed through the same rescaling.
        """
        a_vals = list(row[col_a])
        b_vals = list(row[col_b])

        # number every distinct value by first appearance (col_a first, then
        # anything only in col_b). dict.fromkeys keeps insertion order, so
        # the mapping -- and therefore tau -- does not depend on set/hash
        # iteration order, which varies between runs for strings
        ordered_vals = dict.fromkeys(a_vals + b_vals)
        trend_numeric_map = {val: i for i, val in enumerate(ordered_vals)}

        # make numeric lists for each column
        numeric_a = [trend_numeric_map[val] for val in a_vals]
        numeric_b = [trend_numeric_map[val] for val in b_vals]

        # if not the same length, append to shorter to match
        # add high numbers to end to minimally impact sort
        n_a = len(numeric_a)
        n_b = len(numeric_b)
        if n_a < n_b:
            numeric_a.extend(range(n_a, n_b))
        elif n_a > n_b:
            numeric_b.extend(range(n_b, n_a))

        # compute correlation of prepared numerical lists; p-value not needed
        tau, _ = stats.kendalltau(numeric_a, numeric_b)

        # scale and flip to normalize in [0,1] and round for display
        tau_dist = np.round(1 - (tau + 1) / 2, 4)
        return tau_dist