# Source code for detect_simpsons_paradox.detect_sp
import numpy as np
import pandas as pd
# Functions
def upper_triangle_element(matrix):
    """
    Extract the elements strictly above the main diagonal of a 2d matrix.

    Parameters
    -----------
    matrix : 2d numpy array (or anything ``np.asarray`` turns into one,
        such as a pandas dataframe)

    Returns
    --------
    elements : numpy array
        1d array holding every value whose column index is greater than its
        row index, in row-major order. The diagonal itself is excluded.

    Raises
    -------
    ValueError
        If the input is not two-dimensional.
    """
    values = np.asarray(matrix)
    if values.ndim != 2:
        raise ValueError("matrix must be two-dimensional")

    n_rows, n_cols = values.shape
    # Passing the column count keeps the indices in bounds for non-square
    # input; k=1 skips the diagonal, so no separate np.triu masking is needed.
    rows, cols = np.triu_indices(n_rows, k=1, m=n_cols)
    return values[rows, cols]
def upper_triangle_df(matrix):
    """
    Extract the elements strictly above the main diagonal of a 2d matrix and
    store them in a dataframe together with their row and column positions.

    Parameters
    -----------
    matrix : 2d numpy array (or anything ``np.asarray`` turns into one,
        such as a pandas dataframe)

    Returns
    --------
    result_df : dataframe
        One row per above-diagonal element, in row-major order, with columns
        ``value`` (the element), ``attr1`` (its integer row position) and
        ``attr2`` (its integer column position).

    Raises
    -------
    ValueError
        If the input is not two-dimensional.
    """
    values = np.asarray(matrix)
    if values.ndim != 2:
        raise ValueError("matrix must be two-dimensional")

    n_rows, n_cols = values.shape
    # Compute the index pairs once; passing the column count keeps them in
    # bounds for non-square input, and k=1 excludes the diagonal.
    rows, cols = np.triu_indices(n_rows, k=1, m=n_cols)

    # Column order (value, attr1, attr2) is relied on by positional consumers.
    result_df = pd.DataFrame({
        'value': values[rows, cols],
        'attr1': rows,
        'attr2': cols,
    })
    return result_df
def isReverse(a, b):
    """
    Tell whether two numbers differ in sign.

    The signs are taken with ``np.sign`` and compared for equality; the
    result is the negation of that comparison.

    Parameters
    -----------
    a : number(int or float)
    b : number(int or float)

    Returns
    --------
    boolean value : True when the sign of a is not equal to the sign of b,
        False when the two signs are equal.
    """
    signs_match = np.sign(a) == np.sign(b)
    if signs_match:
        return False
    return True
def detect_simpsons_paradox(latent_df,
                            continuousAttrs_labels=None,
                            groupbyAttrs_labels=None):
    """
    Detect Simpson's Paradox in the subgroups of a dataset.

    For every pair of continuous attributes, the correlation over the whole
    dataset is compared with the correlation inside each subgroup of every
    group-by attribute. A pair is reported for a subgroup when ``isReverse``
    says the two correlations differ in sign.

    Parameters
    -----------
    latent_df : dataframe
        data organized in a pandas dataframe containing both categorical
        and continuous attributes.
    continuousAttrs_labels : list [None]
        list of continuous attributes by name in dataframe, if None will be
        detected by all float64 type columns in dataframe
    groupbyAttrs_labels : list [None]
        list of group by attributes by name in dataframe, if None will be
        detected by all object and int64 type columns in dataframe

    Returns
    --------
    result_df : dataframe
        One row per detected reversal, with columns:

        - ``allCorr`` : correlation of the attribute pair over all the data
        - ``attr1``, ``attr2`` : names of the two continuous attributes
        - ``reverseCorr`` : correlation of the pair inside the subgroup
        - ``groupbyAttr`` : name of the group-by attribute
        - ``subgroup`` : value of the group-by attribute for the subgroup

        If nothing is detected, an empty dataframe with these columns is
        returned.
    """
    # If not specified, detect continuous attributes and categorical
    # attributes from the dataset by column dtype.
    if groupbyAttrs_labels is None:
        groupbyAttrs_labels = list(
            latent_df.select_dtypes(include=['object', 'int64']))
    if continuousAttrs_labels is None:
        continuousAttrs_labels = list(
            latent_df.select_dtypes(include=['float64']))

    result_columns = ['allCorr', 'attr1', 'attr2', 'reverseCorr',
                      'groupbyAttr', 'subgroup']

    # Correlation matrix for all of the data, flattened to its upper triangle
    # together with the positional (row, column) index of each pair.
    all_corr = latent_df[continuousAttrs_labels].corr()
    all_corr_df = upper_triangle_df(all_corr)
    all_corr_element = all_corr_df['value'].values

    # Translate positional indices into attribute names once, up front.
    # A plain list cannot be indexed by a Series, so map element by element.
    attr1_names = [continuousAttrs_labels[int(pos)]
                   for pos in all_corr_df['attr1']]
    attr2_names = [continuousAttrs_labels[int(pos)]
                   for pos in all_corr_df['attr2']]

    # Collect per-subgroup frames and concatenate once at the end;
    # DataFrame.append was removed in pandas 2.0.
    subgroup_frames = []

    for groupbyAttr in groupbyAttrs_labels:
        grouped_df_corr = latent_df.groupby(
            groupbyAttr)[continuousAttrs_labels].corr()
        groupby_value = grouped_df_corr.index.get_level_values(
            groupbyAttr).unique()

        for subgroup in groupby_value:
            subgroup_corr = grouped_df_corr.loc[subgroup]
            # Same pair ordering as all_corr_element, so positions line up.
            subgroup_corr_elements = upper_triangle_element(subgroup_corr)

            # Positions of the pairs whose sign differs from the overall one.
            # NOTE(review): isReverse is also True when either correlation is
            # NaN, so such pairs are reported too - confirm this is intended.
            index_list = [
                pos for pos, (overall, within) in enumerate(
                    zip(all_corr_element, subgroup_corr_elements))
                if isReverse(overall, within)
            ]
            if not index_list:
                continue

            n_found = len(index_list)
            subgroup_frames.append(pd.DataFrame({
                'allCorr': [all_corr_element[pos] for pos in index_list],
                'attr1': [attr1_names[pos] for pos in index_list],
                'attr2': [attr2_names[pos] for pos in index_list],
                'reverseCorr': [subgroup_corr_elements[pos]
                                for pos in index_list],
                'groupbyAttr': [groupbyAttr] * n_found,
                'subgroup': [subgroup] * n_found,
            }, columns=result_columns))

    if not subgroup_frames:
        return pd.DataFrame(columns=result_columns)
    return pd.concat(subgroup_frames, ignore_index=True)