Source code for wiggum.trend_components.base_getvars

import pandas as pd
import numpy as np
import itertools
from io import StringIO

class Trend():
    """
    Base class for abstraction and building trend objects. All trend objects
    must inherit this class in order to have a constructor (__init__); this
    may be overloaded to define a different constructor.

    Parameters
    ----------
    labeled_df : LabeledDataFrame or None
        if passed, get_trend_vars is called on initialization using labeled_df
        as the target dataset to compute trends on
    """
    def __init__(self, labeled_df=None):
        self.trend_precompute = {}
        self.preaugment = None

        # initialize this to False; it's changed by the get_trend_vars
        # functions and then used to avoid reconstructing var lists
        self.set_vars = False

        if labeled_df is not None:
            self.get_trend_vars(labeled_df)

    def get_trend_value_type(self):
        '''
        return the type that the trend values for this trend type should be
        '''
        return self.trend_value_type

    def is_SP(self, row, thresh):
        """
        default test: a result row is flagged when its distance is above the
        threshold; operates row-wise and can be applied to a DataFrame with
        the apply method

        Parameters
        ----------
        row : pd.Series
            row of a result df to apply the threshold to
        thresh : float
            scalar threshold to compare the distance to

        Returns
        -------
        boolean
            True if the distance is over the threshold
        """
        return row['distance'] > thresh

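    # Usage sketch (illustrative, not part of the original source): is_SP is
    # meant to be applied row-wise to a results table with a 'distance'
    # column; the DataFrame and threshold below are made up for demonstration.
    #
    #   result_df = pd.DataFrame({'distance': [0.1, 0.45, 0.8]})
    #   flags = result_df.apply(Trend().is_SP, axis=1, thresh=0.3)
    #   # flags -> False, True, True
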
    def load(self, content_dict):
        '''
        load a trend from a dictionary of the content

        Parameters
        ----------
        content_dict : dict
            the dictionary that results from saving a trend object via the
            trend.__dict__ output

        Returns
        -------
        self : Trend
            a trend object with all of the parameters set according to the
            dictionary
        '''
        # take the dictionary and load it into the properties of the object
        self.__dict__.update(content_dict)

        # reformat csv-ified tables back to DataFrames: iterate over the
        # key, value pairs in the precompute dictionary, keep the keys, and
        # convert each string to a buffer that is read into a DataFrame
        self.trend_precompute = {st: pd.read_csv(StringIO(pc))
                                 for st, pc in self.trend_precompute.items()}

        return self

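# Round-trip sketch (illustrative, not part of the original source): load
# expects the values of trend_precompute to be csv strings; pandas recovers
# an equivalent DataFrame from such a string. The exact serialization used
# when saving is an assumption here.
#
#   df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
#   restored = pd.read_csv(StringIO(df.to_csv(index=False)))   # equal to df
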
################################################################################
# Components
################################################################################
# These components can be mixed together to create the full, final trend
# classes; only those final classes are meant to be imported.


class Regression():
    '''
    common functions for all regression trends
    '''
    symmetric_vars = False
    trend_value_type = float
    detail_view = 'scatter'

    def set_weights_regression(self, labeled_df, i_type, d_type):
        '''
        set regression_vars to all (independent, dependent) pairs of the
        requested variable types and set var_weight_list to the matching
        weight columns; returns True when the lists have been set
        '''
        indep_vars = labeled_df.get_vars_per_roletype('independent', i_type)
        dep_vars = labeled_df.get_vars_per_roletype('dependent', d_type)

        # if the lists are the same, then the trend is symmetric
        dep_indep = [d in indep_vars for d in dep_vars]
        # all entries must be True for the variable lists to match
        if all(dep_indep):
            self.symmetric_vars = True

        # use an iterator to compute all pairs
        reg_var_iterator = itertools.product(indep_vars, dep_vars)

        # transform to a list of tuples, dropping self-pairs, so that the
        # pairs can be reused
        self.regression_vars = [(i, d) for i, d in reg_var_iterator
                                if not i == d]

        # get the weights for the final set of regression vars & cast to tuple
        weights = lambda var_pair: tuple(labeled_df.get_weightcol_per_var(var_pair))
        self.var_weight_list = [weights([i, d]) for i, d in self.regression_vars]

        return True

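    # Pairing sketch (illustrative, not part of the original source): with
    # made-up variable lists, the itertools.product call above builds all
    # ordered (independent, dependent) pairs and drops self-pairs.
    #
    #   indep_vars = ['a', 'b']
    #   dep_vars = ['a', 'b']
    #   pairs = [(i, d) for i, d in itertools.product(indep_vars, dep_vars)
    #            if not i == d]
    #   # pairs -> [('a', 'b'), ('b', 'a')]
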
class OrdinalRegression(Regression):
    """
    regression compatible varTypeMixin, sets list formatted regression_vars
    and symmetric_vars = True
    """

    def get_trend_vars(self, labeled_df):
        """
        set regression_vars for regression of pairs of ordinal variables, by
        assigning regression_vars as an instance property

        Parameters
        ----------
        labeled_df : LabeledDataFrame
            object to parse for variable types

        Returns
        -------
        regression_vars : list of tuples of strings
            all pairs of ordinal trend variables
        var_weight_list : list of tuples
            weight columns to be used for each pair in regression_vars
        """
        self.set_vars = self.set_weights_regression(labeled_df, 'ordinal',
                                                    'ordinal')

        return self.regression_vars

class ContinuousOrdinalRegression(Regression):
    """
    regression compatible varTypeMixin, sets list formatted regression_vars
    and uses both ordinal and continuous variables as dependent and
    independent vars
    """

    def get_trend_vars(self, labeled_df):
        """
        set regression_vars for regression of pairs of ordinal and continuous
        trend variables, by assigning regression_vars as an instance property

        Parameters
        ----------
        labeled_df : LabeledDataFrame
            object to parse for variable types

        Returns
        -------
        regression_vars : list of tuples of strings
            all pairs of trend variables with type set to ordinal or
            continuous
        var_weight_list : list of tuples
            weight columns to be used for each pair in regression_vars
        """
        # use the common regression function to set the lists; returns True
        # if it works
        self.set_vars = self.set_weights_regression(labeled_df,
                                                    ['ordinal', 'continuous'],
                                                    ['ordinal', 'continuous'])

        return self.regression_vars

class ContinuousRegression(Regression):
    """
    regression compatible varTypeMixin, for working with continuous variables;
    sets list formatted regression_vars and symmetric_vars = True
    """

    def get_trend_vars(self, labeled_df):
        """
        set regression_vars for regression of pairs of continuous trend
        variables, by assigning regression_vars as an instance property

        Parameters
        ----------
        labeled_df : LabeledDataFrame
            object to parse for variable types

        Returns
        -------
        regression_vars : list of tuples of strings
            all pairs of trend variables with type set to continuous
        var_weight_list : list of tuples
            weight columns to be used for each pair in regression_vars
        """
        # use the common regression function to set the lists; returns True
        # if it works
        self.set_vars = self.set_weights_regression(labeled_df, 'continuous',
                                                    'continuous')

        return self.regression_vars

def w_median(df, mcol, wcol):
    """
    compute the median, or the median with replication according to weights,
    and give a confidence interval specified by the middle 50% of the data

    compatible with DataFrame.apply() and the get_trends functions in
    wiggum.trend_components.categorical

    Parameters
    ----------
    df : DataFrame or DataFrameGroupBy
        passed as the source of apply, the data to extract columns from for
        computing a weighted median
    mcol : string
        name of column in df to take the median of
    wcol : string or NaN
        name of column in df to use for weighting, or NaN for an unweighted
        median

    Returns
    -------
    stat_data : pandas Series with 'stat', 'max', 'min', 'count' values
        defining the statistic, a confidence interval, and the power of the
        computation
        stat : float
            median of df[mcol], weighted row-wise by df[wcol] if provided
        max : float
            75th percentile, the upper limit of the confidence interval
        min : float
            25th percentile, the lower limit of the confidence interval
        count : int
            number of rows, or sum of the weights
    """
    if pd.isna(wcol):
        wmed, lower, upper = np.quantile(df[mcol], [.5, .25, .75])
        count = df[mcol].count()
    else:
        # replicate each value according to its (integer) weight, then take
        # unweighted quantiles of the replicated data
        reps = [int(n) for n in df[wcol].values]
        reps_mcol = np.repeat(df[mcol].values, reps)
        wmed, lower, upper = np.quantile(reps_mcol, [.5, .25, .75])
        count = df[wcol].sum()

    return pd.Series([wmed, upper, lower, count],
                     index=['stat', 'max', 'min', 'count'])

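# Usage sketch (illustrative, not part of the original source): w_median is
# designed for DataFrame.apply / GroupBy.apply; the column names and data
# below are made up. Weights are replicated, so they should be integer counts.
#
#   df = pd.DataFrame({'g': ['a', 'a', 'b', 'b'],
#                      'score': [1.0, 3.0, 2.0, 4.0],
#                      'weight': [1, 3, 1, 1]})
#   df.groupby('g').apply(w_median, 'score', 'weight')
#   # one row per group with 'stat', 'max', 'min', 'count' columns
#   w_median(df, 'score', np.nan)   # unweighted, over the whole frame
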
def w_avg(df, avcol, wcol):
    """
    compute a weighted average and use the std to define a confidence interval

    compatible with DataFrame.apply() and the get_trends functions in
    wiggum.trend_components.categorical

    Parameters
    ----------
    df : DataFrame or DataFrameGroupBy
        passed as the source of apply, the data to extract columns from for
        computing a weighted average
    avcol : string
        name of column in df to take the average of
    wcol : string or NaN
        name of column in df to use for weighting, or NaN for an unweighted
        average

    Returns
    -------
    stat_data : pandas Series with 'stat', 'max', 'min' values defining the
        statistic and a confidence interval and 'count' defining the power of
        the computation
        stat : float
            mean of df[avcol], weighted row-wise by df[wcol]
        max : float
            mean + std, the upper limit of the confidence interval
        min : float
            mean - std, the lower limit of the confidence interval
        count : int
            sum of df[wcol], or the number of non-null rows if unweighted
    """
    n_df = df.dropna(axis=0, subset=[avcol])

    if len(n_df):
        if pd.isna(wcol):
            wmean = n_df[avcol].mean()
            std = n_df[avcol].std()
            count = n_df[avcol].count()
        else:
            # weighted mean: np.sum(df[avcol]*df[wcol]) / np.sum(df[wcol])
            wmean = np.average(n_df[avcol], weights=n_df[wcol])
            var = np.average((n_df[avcol] - wmean)**2, weights=n_df[wcol])
            std = np.sqrt(var)
            count = n_df[wcol].sum()
    else:
        wmean = 0.0
        std = 0.0
        count = 0

    return pd.Series([wmean, wmean + std, wmean - std, count],
                     index=['stat', 'max', 'min', 'count'])

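# Usage sketch (illustrative, not part of the original source): w_avg works
# the same way; the column names and data below are made up. For group 'a'
# here the weighted mean is (1*1.0 + 3*3.0) / 4 = 2.5.
#
#   df = pd.DataFrame({'g': ['a', 'a', 'b', 'b'],
#                      'score': [1.0, 3.0, 2.0, 4.0],
#                      'weight': [1, 3, 1, 1]})
#   df.groupby('g').apply(w_avg, 'score', 'weight')
#   w_avg(df, 'score', np.nan)      # unweighted, over the whole frame
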
class BinaryWeightedRank():
    """
    statRank compatible varTypeMixin, for computing means of only binary
    valued variables

    sets stat to wg.trend_components.w_avg
    """
    trend_value_type = str
    detail_view = 'rank'

    def get_trend_vars(self, labeled_df):
        """
        set target, trendgroup, and var_weight_list for computing rank trends

        Parameters
        ----------
        labeled_df : LabeledDataFrame
            object to parse for variable types

        Returns
        -------
        target : list of strings
            binary dependent variables to compute rank trends of
        trendgroup : list of strings
            binary-valued categorical independent variables to group by
        """
        self.target = labeled_df.get_vars_per_roletype('dependent', 'binary')
        all_cat = labeled_df.get_vars_per_roletype('independent', 'categorical')
        # keep only the categorical variables that take exactly two values
        self.trendgroup = [var for var in all_cat
                           if len(pd.unique(labeled_df.df[var])) == 2]
        self.var_weight_list = labeled_df.get_weightcol_per_var(self.target)
        self.set_vars = True

        return (self.target, self.trendgroup)

class WeightedRank():
    """
    common parts for all continuous variable trends
    """
    trend_value_type = str
    detail_view = 'rank'

    def get_trend_vars(self, labeled_df):
        """
        set target, trendgroup, and var_weight_list for computing rank trends
        of binary and continuous dependent variables grouped by categorical
        independent variables
        """
        # maybe not counts
        self.target = labeled_df.get_vars_per_roletype('dependent',
                                                       ['binary', 'continuous'])
        self.trendgroup = labeled_df.get_vars_per_roletype('independent',
                                                           'categorical')
        self.var_weight_list = labeled_df.get_weightcol_per_var(self.target)
        self.set_vars = True

        return self.target, self.trendgroup

class PredictionClass():
    """
    for binary classification performance stats
    """
    trend_value_type = float

    def get_trend_vars(self, labeled_df):
        """
        set groundtruth and prediction variable lists and request the
        'confusion' pre-augmentation
        """
        self.groundtruth = labeled_df.get_vars_per_role('groundtruth')
        self.prediction = labeled_df.get_vars_per_role('prediction')
        self.preaugment = 'confusion'
        self.set_vars = True

        return self.groundtruth, self.prediction