# Source code for detect_simpsons_paradox.detect_sp

import numpy as np
import pandas as pd

# Functions
def upper_triangle_element(matrix):
    """
    Collect the entries that lie strictly above the main diagonal.

    Parameters
    -----------
    matrix : 2d numpy array (or array-like)
             The number of rows is used as the size of the triangle, so
             the input is treated as a square matrix.

    Returns
    --------
    elements : numpy array
               1d array of the above-diagonal values, read row by row
               (the ordering produced by ``np.triu_indices``).

    """
    # Mask out the diagonal and everything below it
    upper = np.triu(matrix, k=1)

    # Row/column positions of the strictly-upper entries
    row_idx, col_idx = np.triu_indices(upper.shape[0], k=1)

    return upper[row_idx, col_idx]


def upper_triangle_df(matrix):
    """
    Tabulate the entries strictly above the main diagonal together with
    the row and column position each one came from.

    Parameters
    -----------
    matrix : 2d numpy array (or array-like)
             The number of rows is used as the size of the triangle, so
             the input is treated as a square matrix.

    Returns
    --------
    result_df : dataframe
               One row per above-diagonal entry with the columns
               'value' (the entry), 'attr1' (its row index) and
               'attr2' (its column index), in ``np.triu_indices`` order.
    """
    # Mask out the diagonal and everything below it
    upper = np.triu(matrix, k=1)

    # Positions of the strictly-upper entries; computed once and reused
    # both for the lookup and for the index columns
    row_idx, col_idx = np.triu_indices(upper.shape[0], k=1)

    return pd.DataFrame({
        'value': upper[row_idx, col_idx],
        'attr1': row_idx,
        'attr2': col_idx,
    })


def isReverse(a, b):
    """
    Tell whether two numbers have different signs.

    The signs are taken with ``np.sign`` and compared for equality, so the
    result is True whenever the two signs are not equal.

    Parameters
    -----------
    a : number(int or float)
    b : number(int or float)

    Returns
    --------
    boolean value : True when the sign of a differs from the sign of b,
                    False when both signs are equal.
    """
    signs_match = np.sign(a) == np.sign(b)
    if signs_match:
        return False
    return True

def detect_simpsons_paradox(latent_df,
                            continuousAttrs_labels=None,
                            groupbyAttrs_labels=None):
    """
    Detect Simpson's Paradox in the subgroups of a data set.

    For every pair of continuous attributes the correlation over the whole
    data set is compared with the correlation inside each subgroup of each
    group-by attribute. A pair is reported for a subgroup when the sign of
    the subgroup correlation differs from the sign of the overall one.

    Parameters
    -----------
    latent_df : dataframe
        data organized in a pandas dataframe containing both categorical
        and continuous attributes.
    continuousAttrs_labels : list [None]
        list of continuous attributes by name in dataframe, if None will be
        detected by all float64 type columns in dataframe
    groupbyAttrs_labels  : list [None]
        list of group by attributes by name in dataframe, if None will be
        detected by all object and int64 type columns in dataframe

    Returns
    --------
    result_df : dataframe
        One row per detected reversal, with the columns:

        allCorr     : correlation of the attribute pair over all of the data
        attr1       : name of the first attribute of the pair
        attr2       : name of the second attribute of the pair
        reverseCorr : correlation of the same pair inside the subgroup
        groupbyAttr : name of the group-by attribute that defines the subgroup
        subgroup    : value of the group-by attribute for the subgroup

        When nothing is detected an empty dataframe with these columns is
        returned.

    """
    result_columns = ['allCorr', 'attr1', 'attr2',
                      'reverseCorr', 'groupbyAttr', 'subgroup']

    # if not specified, detect continuous attributes and categorical
    # attributes from dataset
    if groupbyAttrs_labels is None:
        groupbyAttrs_labels = list(
            latent_df.select_dtypes(include=['object', 'int64']))

    if continuousAttrs_labels is None:
        continuousAttrs_labels = list(
            latent_df.select_dtypes(include=['float64']))

    # A correlation needs a pair of attributes; with fewer than two there is
    # nothing to compare.
    if len(continuousAttrs_labels) < 2:
        return pd.DataFrame(columns=result_columns)

    # Correlation matrix for all of the data, reduced to its upper triangle
    # (each attribute pair once, diagonal excluded).
    all_corr = latent_df[continuousAttrs_labels].corr()
    all_corr_df = upper_triangle_df(all_corr)
    all_corr_element = all_corr_df['value'].values

    # Translate the positional pair indices into attribute names once, so
    # the per-subgroup code below can simply pick from these lists.
    attr1_names = [continuousAttrs_labels[i] for i in all_corr_df['attr1']]
    attr2_names = [continuousAttrs_labels[i] for i in all_corr_df['attr2']]

    # Per-subgroup results are collected here and concatenated once at the
    # end (DataFrame.append was removed in pandas 2.0).
    partial_results = []

    # Loop by group-by attributes
    for groupbyAttr in groupbyAttrs_labels:
        grouped_df_corr = latent_df.groupby(groupbyAttr)[continuousAttrs_labels].corr()
        groupby_value = grouped_df_corr.index.get_level_values(groupbyAttr).unique()

        for subgroup in groupby_value:
            # Correlation matrix of this subgroup, same pair ordering as
            # all_corr_element
            subgroup_corr = grouped_df_corr.loc[subgroup]
            subgroup_corr_elements = upper_triangle_element(subgroup_corr)

            # Positions of the pairs whose subgroup correlation has a
            # different sign than the overall correlation.
            # NOTE(review): isReverse compares np.sign values, so an
            # undefined (NaN) subgroup correlation is also reported here;
            # confirm whether that is intended.
            index_list = [
                i
                for i, (a, b) in enumerate(zip(all_corr_element,
                                               subgroup_corr_elements))
                if isReverse(a, b)
            ]
            if not index_list:
                continue

            num_found = len(index_list)
            partial_results.append(pd.DataFrame({
                'allCorr': [all_corr_element[i] for i in index_list],
                'attr1': [attr1_names[i] for i in index_list],
                'attr2': [attr2_names[i] for i in index_list],
                'reverseCorr': [subgroup_corr_elements[i] for i in index_list],
                'groupbyAttr': [groupbyAttr] * num_found,
                'subgroup': [subgroup] * num_found,
            }, columns=result_columns))

    if not partial_results:
        return pd.DataFrame(columns=result_columns)

    return pd.concat(partial_results, ignore_index=True)
# Table Of Contents

# Related Topics

# Source code for detect_simpsons_paradox.detect_sp