Source code for wiggum.trend_components.statistical

import pandas as pd
import numpy as np
import itertools
import scipy.stats as stats

# Dispatch table keyed on the exact type of the data handed to a trend.
# Each value is a one-argument callable that recovers the grouping name:
#   * grouped data  -> the groupby object's ``keys`` attribute
#   * plain frame   -> None, since there is no grouping
groupby_name_by_type = {
    pd.core.groupby.DataFrameGroupBy: lambda df: df.keys,
    pd.core.frame.DataFrame: lambda df: None,
}

class CorrelationBase():
    """
    Shared machinery for trends that are built on pairwise correlations.

    The methods here read ``name``, ``regression_vars``, ``corrtype``,
    ``set_vars`` and ``trend_precompute`` from the instance and call
    ``get_trend_vars``; none of these is defined in this class, so they must
    be supplied by whatever this base is combined with.
    """
    overview_legend = 'binary'

    def is_computable(self,labeled_df=None):
        """
        check if this trend can be computed based on data and metadata available

        Parameters
        ----------
        labeled_df : LabeledDataFrame {None} (optional)
            data to use if trend is not already configured; it is passed to
            get_trend_vars when set_vars is falsy


        Returns
        -------
        computable : bool
            True if regression_vars either starts with a tuple or has more
            than two entries, and corrtype is truthy.  An empty
            regression_vars gives False.

        See also:
        get_trends() for description of how this trend computes
        """
        if not self.set_vars:
            self.get_trend_vars(labeled_df)

        regression_vars = self.regression_vars

        # guard the [0] lookup so that an empty variable list reports
        # "not computable" instead of raising IndexError
        first_is_pair = (len(regression_vars) > 0 and
                         isinstance(regression_vars[0], tuple))
        has_enough_vars = len(regression_vars) > 2

        # plain boolean logic rather than np.product, which is no longer
        # available in NumPy 2.0; this also returns a real bool as documented
        return bool((first_is_pair or has_enough_vars) and self.corrtype)

    def compute_correlation_table(self,data_df,trend_col_name):
        '''
        common code for computing correlations for any correlation based trend


        Parameters
        ----------
        data_df : DataFrame or DataFrameGroupBy
            data to compute trends on, may be a whole, unmodified DataFrame or
            a grouped DataFrame as passed by LabeledDataFrame get trend functions
        trend_col_name : {'subgroup_trend','agg_trend'}
            which type of trend is to be computed

        Required properties
        --------------------
        name : string
            joined with trend_col_name to form the key under which the
            correlation matrix is stored in trend_precompute
        regression_vars : list of (independent, dependent) pairs
            variables to compute correlations of
        corrtype : string {'pearson','spearman','kendall'}
            correlation type to be passed to .corr(method=corrtype)
        trend_precompute : dict-like
            receives the full correlation matrix as a side effect


        Returns
        -------
        corr_data : list of tuples
            the tuples are of (independent variable name, dependent variable
            name, correlation, grouping value); the grouping value is '' when
            data_df is not grouped.  Empty list if regression_vars is empty.
        '''
        # nothing to correlate; unpacking zip(*[]) below would raise
        if len(self.regression_vars) == 0:
            return []

        # recover a single list from the independent and dependent vars;
        # dict.fromkeys removes duplicates while keeping first-seen order so
        # the stored matrix has a reproducible column order
        indep, dep = zip(*self.regression_vars)
        corr_var_list = list(dict.fromkeys(itertools.chain(indep, dep)))

        # name of the current trend
        trend_name = '_'.join([self.name , trend_col_name])
        # compute correlations
        corr_mat = data_df[corr_var_list].corr(method=self.corrtype)
        # store the correlation matrix for later use
        self.trend_precompute[trend_name] = corr_mat

        # unpack into a list of tuples
        if isinstance(data_df, pd.core.groupby.DataFrameGroupBy):
            groupby_vars = list(data_df.groups.keys())

            # grouped result: column i, then group g, then row d
            corr_data = [(i,d, corr_mat[i][g][d],g) for (i,d),g in
                                itertools.product(self.regression_vars,groupby_vars)]
        else:
            # not grouped: a plain square matrix, subgroup left as ''
            corr_data = [(i,d, corr_mat[i][d],'') for i,d in self.regression_vars]

        return corr_data

    def wrap_reg_df(self, reg_df,groupby_name):
        '''
        add the groupby variable or drop the subgroup column

        Parameters
        ----------
        reg_df : DataFrame
            dataframe created by wrapping the output of
            compute_correlation_table; it is modified in place
        groupby_name : string or None
            name for the groupby column or None if not a subgroup


        Returns
        -------
        reg_df : DataFrame
            the same object that was passed in, with an added splitby column
            (groupby_name truthy) or with the subgroup column removed
            (groupby_name falsy), and an added trend_type column
        '''

        if groupby_name:
            # grouped: the same grouping name applies to every row
            reg_df['splitby'] = groupby_name
        else:
            # not grouped: the subgroup column carries no information
            reg_df.drop(columns = 'subgroup',inplace=True)

        # add the trend name everywhere
        reg_df['trend_type'] = self.name

        return reg_df


class CorrelationTrend(CorrelationBase):
    """
    Correlation based trend: the trend value is the correlation itself, its
    strength is the absolute value, and the distance between two trends is a
    binary comparison of their signs.
    """

    ############################################################################
    # trend computation functions
    ############################################################################

    def get_trends(self,data_df,trend_col_name):
        """
        Build a partial result_df holding the correlation of every variable
        pair together with its strength

        Parameters
        ----------
        data_df : DataFrame or DataFrameGroupBy
            data to compute trends on, either a whole DataFrame or a grouped
            one
        trend_col_name : {'subgroup_trend','agg_trend'}
            which type of trend is to be computed; also used as the name of
            the value column and as the prefix of the strength column

        Required properties
        --------------------
        name : string
            used in the trend_type column of result_df
        regression_vars : list of (independent, dependent) pairs
            variables to compute correlations of
        corrtype : string {'pearson','spearman','kendall'}
            correlation type, forwarded to compute_correlation_table


        Returns
        -------
        reg_df : DataFrame
            partial result_df with columns independent, dependent,
            <trend_col_name>, <trend_col_name>_strength and trend_type, plus
            either subgroup and splitby (grouped data) or neither
        """
        correlations = self.compute_correlation_table(data_df,trend_col_name)

        strength_col = trend_col_name+'_strength'
        column_names = ['independent','dependent',trend_col_name,
                        strength_col,'subgroup']

        # one output row per correlation; strength is the magnitude of the
        # correlation regardless of direction
        rows = []
        for indep_var, dep_var, corr_val, subgroup in correlations:
            rows.append([indep_var, dep_var, corr_val, np.abs(corr_val),
                         subgroup])

        reg_df = pd.DataFrame(data=rows, columns=column_names)

        # look up how to extract the grouping name for this kind of input:
        # None for a plain DataFrame, the groupby keys for grouped data
        name_getter = groupby_name_by_type[type(data_df)]

        # finalize the table
        return self.wrap_reg_df(reg_df, name_getter(data_df))

    def get_distance(self,row,col_a='subgroup_trend',col_b='agg_trend'):
        """
        binary distance between two trend values of one result_df row:
        0 when both have the same sign, 1 otherwise

        Parameters
        ----------
        row : pd.Series
            row of a result_df DataFrame
        col_a : string {'subgroup_trend'}
            name of the first trend column to read from row
        col_b : string {'agg_trend'}
            name of the second trend column to read from row

        Returns
        -------
        <>_dist : int
            0 if np.sign of the two values is equal, 1 if not
        """
        subgroup_sign = np.sign(row[col_a])
        aggregate_sign = np.sign(row[col_b])

        same_direction = subgroup_sign == aggregate_sign
        return 0 if same_direction else 1


class CorrelationSignTrend(CorrelationBase):
    '''
    trends that are based on a correlation of type that is specified as a
    property and computes a binary comparison of the signs as a distance
    '''

    def get_distance(self,row,col_a='subgroup_trend',col_b='agg_trend'):
        """
        binary distance between two trend values of one result_df row:
        0 when the two values are equal, 1 otherwise

        The values are compared directly with ==, no sign is taken here.
        NOTE(review): presumably the stored trend values are already signs,
        which would make this a same-sign / opposite-sign test -- confirm
        against the code that fills the trend columns.

        Parameters
        ----------
        row : pd.Series
            row of a result_df DataFrame
        col_a : string {'subgroup_trend'}
            name of the first trend column to read from row
        col_b : string {'agg_trend'}
            name of the second trend column to read from row

        Returns
        -------
        <>_dist : int
            distance between the two trend values, compatible with
            assignment to a cell of a result_df
        """
        sg_trend = row[col_a]
        ag_trend = row[col_b]

        # equal values give 0, anything else gives 1
        binary_distance = int(not(sg_trend == ag_trend))
        return binary_distance