import numpy as np
import pandas as pd
from random import randint, gauss
import math
import scipy.stats as stats
import string
def linear_sp(N):
    """
    generate synthetic data for SP in linear trends by generating
    linear data with offsets that induce SP
    """
def simple_regression_sp(N, mu, cov):
    """
    generate synthetic data for the simplest case of group-wise SP of the
    regression type; generates data from k clusters with centers mu, each
    with covariance cov.
    mu and cov must induce SP, this does not make SP happen
    adds 1 noisy dimension

    Parameters
    -----------
    N : scalar integer
        number of samples total to draw
    mu : k cluster centers in d dimensions
        locations of the clusters
    cov : d_1 x d_1 covariance
        shared covariance of all subgroup clusters
    """
    # number of clusters
    k = len(mu)
    # dimensions
    d = len(mu[0])
    # generate var names
    var_names = ['x' + str(i+1) for i in range(d)]
    # sample cluster assignments (should accept weights to allow this to
    # vary, i.e. class imbalance)
    z = np.random.randint(0, k, N)
    x = np.asarray([np.random.multivariate_normal(mu[z_i], cov) for z_i in z])
    # make a dataframe
    latent_df = pd.DataFrame(data=x, columns=var_names)
    # add a noise column
    x_n = np.random.normal(0, 10, N)
    latent_df['x_n'] = x_n
    # code z as color and add that as a column to the dataframe (assumes k <= 2)
    color_z = {0: 'r', 1: 'b'}
    latent_df['color'] = [color_z[z_i] for z_i in z]
    return latent_df
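# Example usage (a minimal sketch; the mu and cov values below are
# illustrative assumptions and are not guaranteed to induce SP):
#   >>> mu = [[1, 1], [5, 5]]
#   >>> cov = [[.6, -.5], [-.5, .6]]
#   >>> df = simple_regression_sp(200, mu, cov)
#   >>> list(df.columns)
#   ['x1', 'x2', 'x_n', 'color']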
def noise_regression_sp(N, mu, cov, d_noise):
    """
    generate synthetic data for the simplest case of group-wise SP of the
    regression type; generates data from k clusters with centers mu, each
    with covariance cov.
    mu and cov must induce SP, this does not make SP happen
    adds d_noise noisy dimensions

    Parameters
    -----------
    N : scalar integer
        number of samples total to draw
    mu : k cluster centers in d dimensions
        locations of the clusters
    cov : d_1 x d_1 covariance
        shared covariance of all subgroup clusters
    d_noise : scalar integer
        number of noise dimensions to append to the data
    """
    # number of clusters
    k = len(mu)
    # dimensions, original plus noise
    d_orig = len(mu[0])
    d = d_orig + d_noise
    # generate var names
    var_names = ['x' + str(i+1) for i in range(d)]
    # add noise dimensions to mu and cov (block-diagonal extension, with
    # unit-variance independent noise dimensions)
    mu = np.append(mu, np.zeros([k, d_noise]), axis=1)
    cov = np.vstack([np.hstack([cov, np.zeros([d_orig, d_noise])]),
                     np.hstack([np.zeros([d_noise, d_orig]), np.eye(d_noise)])])
    # sample cluster assignments (should accept weights to allow this to
    # vary, i.e. class imbalance)
    z = np.random.randint(0, k, N)
    x = np.asarray([np.random.multivariate_normal(mu[z_i], cov) for z_i in z])
    # make a dataframe
    latent_df = pd.DataFrame(data=x, columns=var_names)
    # add one more standalone noise column
    x_n = np.random.normal(0, 10, N)
    latent_df['x_n'] = x_n
    # code z as color and add that as a column to the dataframe (assumes k <= 2)
    color_z = {0: 'r', 1: 'b'}
    latent_df['color'] = [color_z[z_i] for z_i in z]
    return latent_df
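# Example usage (a minimal sketch with illustrative mu/cov assumptions):
#   >>> df = noise_regression_sp(200, [[1, 1], [5, 5]],
#   ...                          [[.6, -.5], [-.5, .6]], d_noise=3)
#   >>> list(df.columns)
#   ['x1', 'x2', 'x3', 'x4', 'x5', 'x_n', 'color']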
def random_with_N_digits(n):
    """
    return a random integer with exactly n digits
    """
    range_start = 10**(n-1)
    range_end = (10**n) - 1
    return randint(range_start, range_end)
def generateDataset(N, numClu, numExtra):
    """
    generate synthetic dataset for time experiments

    Parameters
    --------------
    N : scalar
        total samples to draw
    numClu : scalar integer
        number of clusters
    numExtra : scalar integer
        number of extra categorical columns and continuous columns
    """
    # generate numClu mu_x and mu_y for sampling data from numClu clusters
    # create the first two mu
    mu = np.asarray([[1, 1], [5, 5]])
    # generate the rest of the mu
    for i in range(numClu - 2):
        mu_x = random_with_N_digits(2)
        variance = 1000
        mu_y = gauss(mu_x, math.sqrt(variance))
        mu_new = np.asarray([mu_x, mu_y])
        mu = np.append(mu, [mu_new], axis=0)
    # covariance must be symmetric positive semi-definite; this symmetrizes
    # the original asymmetric matrix [[.6,-1],[0,.6]]
    cov = [[.6, -.5], [-.5, .6]]
    # generate the data set
    latent_df = mixed_regression_sp_extra(N, mu, cov, [.7, .3], numExtra)
    return latent_df
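# Example usage (a minimal sketch; assumes mixed_regression_sp_extra is
# defined elsewhere in this module):
#   >>> df = generateDataset(1000, numClu=5, numExtra=2)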
def mixed_regression_sp(N, mu, cov, p):
    """
    generate synthetic data for the simplest case of group-wise SP of the
    regression type
    mu and cov must induce SP, this does not make SP happen
    adds 1 noisy dimension and an interacting character attribute

    Parameters
    -----------
    N : scalar integer
        number of samples total to draw
    mu : k cluster centers in d dimensions
        locations of the clusters
    cov : d_1 x d_1 covariance
        shared covariance of all subgroup clusters
    p : 2 vector
        probabilities of the two levels of the interacting subgroup variable y
    """
    # number of clusters
    k = len(mu)
    # sample cluster assignments (should accept weights to allow this to
    # vary, i.e. class imbalance)
    z = np.random.randint(0, k, N)
    x = np.asarray([np.random.multivariate_normal(mu[z_i], cov) for z_i in z])
    # make a dataframe
    latent_df = pd.DataFrame(data=x, columns=['x1', 'x2'])
    # add a noise column
    x3 = np.random.normal(0, 10, N)
    latent_df['x3'] = x3
    # code z as color and add that as a column to the dataframe (assumes k <= 2)
    color_z = {0: 'r', 1: 'b'}
    latent_df['color'] = [color_z[z_i] for z_i in z]
    # add a subgroup: a character attribute that interacts with color based on p
    char_zy = {0: {0: 'x', 1: 'o'}, 1: {0: 'o', 1: 'x'}}
    y = np.random.choice([0, 1], N, p=p)
    latent_df['y'] = y
    latent_df['char'] = [char_zy[zi][yi] for zi, yi in zip(z, y)]
    return latent_df
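# Example usage (a minimal sketch with illustrative mu/cov assumptions):
#   >>> df = mixed_regression_sp(200, [[1, 1], [5, 5]],
#   ...                          [[.6, -.5], [-.5, .6]], p=[.7, .3])
#   >>> list(df.columns)
#   ['x1', 'x2', 'x3', 'color', 'y', 'char']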
"""
# for making more complex interactions, should be its own generator will use abo
"""
def regression_corr(N, mu, cov):
    """
    generate synthetic data for the simplest case of group-wise SP of the
    regression type
    mu and cov must induce SP, this does not make SP happen
    adds 1 noisy dimension

    Parameters
    -----------
    N : scalar integer
        number of samples total to draw
    mu : k cluster centers in d dimensions
        locations of the clusters
    cov : d_1 x d_1 covariance
        shared covariance of all subgroup clusters
    """
    # number of clusters
    k = len(mu)
    # sample cluster assignments (should accept weights to allow this to
    # vary, i.e. class imbalance)
    z = np.random.randint(0, k, N)
    x = np.asarray([np.random.multivariate_normal(mu[z_i], cov) for z_i in z])
    # make a dataframe
    latent_df = pd.DataFrame(data=x, columns=['x1', 'x2'])
    # add a noise column
    x3 = np.random.normal(0, 10, N)
    latent_df['x3'] = x3
    # code z as color and add that as a column to the dataframe (assumes k <= 2)
    color_z = {0: 'r', 1: 'b'}
    latent_df['color'] = [color_z[z_i] for z_i in z]
    return latent_df
def means_with_spread(mu_mu, cov, k):
    """
    sample k means from a gaussian distribution and downsample with an SE
    kernel for a point process-like effect

    Parameters
    -----------
    mu_mu : 2-vector
        center of the means
    cov : 2x2
        covariance matrix for the means
    k : scalar
        number of means to return

    Returns
    -------
    mu_sort : list of k 2-vectors
        means sorted by keep probability, most spread out first
    """
    # define a sampling function so that we can sample the means jointly
    next_sample = lambda: np.random.multivariate_normal(mu_mu, cov)
    # we'll use a gaussian kernel around each accepted mean to filter;
    # only the closest point matters
    # scale here probably should be set to help provide guarantees
    dist = lambda mu_c, x: stats.norm.pdf(min(np.sum(np.square(mu_c - x), axis=1)))
    # keep the first one
    mu = [next_sample()]
    p_dist = [1]
    # stop once k means have been accepted
    while len(mu) < k:
        m = next_sample()
        p_keep = 1 - dist(mu, m)
        if p_keep > .99:
            mu.append(m)
            p_dist.append(p_keep)
    mu = np.asarray(mu)
    # sort by keep probability, most spread out first
    mu_sort, p_sort = zip(*sorted(zip(mu, p_dist),
                                  key=lambda x: x[1], reverse=True))
    return mu_sort
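# Example usage (a minimal sketch; the center and covariance values are
# illustrative assumptions):
#   >>> centers = means_with_spread([0, 0], np.eye(2) * 25, k=4)
#   >>> len(centers)
#   4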
def geometric_2d_gmm_sp(r_clusters, cluster_size, cluster_spread, p_sp_clusters,
                        domain_range, k, N, p_clusters=None):
    """
    Sample from a gaussian mixture model with Simpson's Paradox and spread
    means; return the data in a data frame

    Parameters
    -----------
    r_clusters : scalar in [0,1]
        correlation coefficient of clusters
    cluster_size : 2 vector
        variance in each direction of each cluster
    cluster_spread : scalar in [0,1]
        pearson correlation of means
    p_sp_clusters : scalar in [0,1]
        portion of clusters with SP
    p_clusters : vector in [0,1)^k, optional
        probability of membership of a sample in each cluster (controls
        relative size of clusters); default is [1.0/k]*k for uniform
    domain_range : [xmin, xmax, ymin, ymax]
        planned region for points to be in; means will be in the middle 60%
    k : integer
        number of clusters
    N : scalar
        number of points
    """
    # if not defined, set uniform cluster probability
    if p_clusters is None:
        p_clusters = [1.0/k]*k
    # sample the data
    x, z = data_only_geometric_2d_gmm(r_clusters, cluster_size, cluster_spread,
                                      p_sp_clusters,
                                      domain_range, k, N, p_clusters)
    # make a dataframe
    latent_df = pd.DataFrame(data=x, columns=['x1', 'x2'])
    # code cluster as color and add it as a column to the dataframe
    latent_df['color'] = z
    return latent_df
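# Example usage (a minimal sketch; all parameter values are illustrative
# assumptions):
#   >>> df = geometric_2d_gmm_sp(r_clusters=.6, cluster_size=[1, 1],
#   ...                          cluster_spread=.8, p_sp_clusters=.9,
#   ...                          domain_range=[0, 20, 0, 20], k=3, N=500)
#   >>> list(df.columns)
#   ['x1', 'x2', 'color']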
def geometric_indep_views_gmm_sp(d, r_clusters, cluster_size, cluster_spread,
                                 p_sp_clusters, domain_range, k, N,
                                 p_clusters=None):
    """
    Sample from a gaussian mixture model with Simpson's Paradox and spread
    means; return the data in a data frame

    Parameters
    -----------
    d : integer
        number of independent views, groups of 3 columns with SP
    r_clusters : scalar in [0,1] or list of d
        correlation coefficient of clusters
    cluster_size : 2 vector or list of d
        variance in each direction of each cluster
    cluster_spread : scalar in [0,1] or list of d
        pearson correlation of means
    p_sp_clusters : scalar in [0,1] or list of d
        portion of clusters with SP
    p_clusters : vector in [0,1)^k or list of d vectors, optional
        probability of membership of a sample in each cluster (controls
        relative size of clusters); default is [1.0/k]*k for uniform
    domain_range : [xmin, xmax, ymin, ymax] or list of d
        planned region for points to be in; means will be in the middle 60%
    k : integer or list of d
        number of clusters
    N : scalar
        number of points, shared across all views
    """
    # make scalar inputs into lists if needed
    scalar_to_list = lambda x: [x]*d  # repeat a scalar into a length-d list
    if type(r_clusters) in [float, int]:
        r_clusters = scalar_to_list(r_clusters)
    if type(cluster_spread) in [float, int]:
        cluster_spread = scalar_to_list(cluster_spread)
    if type(p_sp_clusters) in [float, int]:
        p_sp_clusters = scalar_to_list(p_sp_clusters)
    if type(k) is int:
        k = scalar_to_list(k)
    # if not defined, set uniform cluster probability per view (after k is a
    # list, so the default also works when k varies across views)
    if p_clusters is None:
        p_clusters = [[1.0/k_i]*k_i for k_i in k]
    if type(p_clusters[0]) in [float, int]:
        p_clusters = scalar_to_list(p_clusters)
    if type(cluster_size[0]) in [float, int]:
        cluster_size = scalar_to_list(cluster_size)
    if type(domain_range[0]) in [float, int]:
        domain_range = scalar_to_list(domain_range)
    x = []
    z = []
    for r, c_std, c_sp, p_sp, d_r, k_i, rho in zip(r_clusters, cluster_size,
                                                   cluster_spread, p_sp_clusters,
                                                   domain_range, k, p_clusters):
        # sample the data for one view
        x_tmp, z_tmp = data_only_geometric_2d_gmm(r, c_std, c_sp, p_sp,
                                                  d_r, k_i, N, rho)
        x.append(x_tmp)
        z.append(z_tmp)
    col_names = ['x' + str(i+1) for i in range(d*2)]
    # make a dataframe (np.concatenate works for any d, unlike np.append(*x))
    x = np.concatenate(x, axis=1)
    latent_df = pd.DataFrame(data=x, columns=col_names)
    # cluster naming: the columns will be named A, B, ...
    # and the values will be A1, A2, ..., Ak, ...
    z_names = list(string.ascii_uppercase[:d])
    # code each view's cluster and add it as a column to the dataframe
    for z_i, name in zip(z, z_names):
        latent_df[name] = [name + str(z_ii) for z_ii in z_i]
    return latent_df
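# Example usage (a minimal sketch; all parameter values are illustrative
# assumptions):
#   >>> df = geometric_indep_views_gmm_sp(2, .6, [1, 1], .8, .9,
#   ...                                   [0, 20, 0, 20], 3, 500)
#   >>> list(df.columns)
#   ['x1', 'x2', 'x3', 'x4', 'A', 'B']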
def data_only_geometric_2d_gmm(r_clusters, cluster_size, cluster_spread,
                               p_sp_clusters, domain_range, k, N, p_clusters):
    """
    private, sampler only; returns the raw variables, a utility for sharing
    in other samplers

    Sample from a gaussian mixture model with Simpson's Paradox and spread
    means

    Parameters
    -----------
    r_clusters : scalar in [0,1]
        correlation coefficient of clusters
    cluster_size : 2 vector
        variance in each direction of each cluster
    cluster_spread : scalar in [0,1]
        pearson correlation of means
    p_sp_clusters : scalar in [0,1]
        portion of clusters with SP
    p_clusters : vector in [0,1)^k
        probability of membership of a sample in each cluster (controls
        relative size of clusters)
    domain_range : [xmin, xmax, ymin, ymax]
        planned region for points to be in; means will be in the middle 60%
    k : integer
        number of clusters
    N : scalar
        number of points
    """
    # define a distribution for the means, using the range provided
    mu_mu = [np.mean(domain_range[:2]), np.mean(domain_range[2:])]
    # first set the correlation matrix for the means; the means trend opposite
    # to the within-cluster correlation, which is what induces SP
    mu_sign = -np.sign(r_clusters)
    corr = [[1, mu_sign*cluster_spread], [mu_sign*cluster_spread, 1]]
    # use a trimmed range to compute the std (trim 20% from each edge)
    mu_trim = .2
    mu_transform = np.repeat(np.diff(domain_range)[[0, 2]]*mu_trim, 2)
    mu_transform[[1, 3]] = mu_transform[[1, 3]]*-1  # sign flip every other
    mu_domain = [d_r + m_t for d_r, m_t in zip(domain_range, mu_transform)]
    mu_std = np.sqrt(np.diag(np.diff(mu_domain)[[0, 2]]))
    # construct the covariance of the means from the correlation
    mu_cov = np.dot(mu_std, corr).dot(mu_std)
    # sample the means
    mu = means_with_spread(mu_mu, mu_cov, k)
    # create cluster covariances for SP and not SP
    cluster_std = np.diag(np.sqrt(cluster_size))
    cluster_corr_sp = np.asarray([[1, r_clusters], [r_clusters, 1]])  # correlation with SP
    cluster_cov_sp = np.dot(cluster_std, cluster_corr_sp).dot(cluster_std)  # cov with SP
    cluster_corr = np.asarray([[1, -r_clusters], [-r_clusters, 1]])  # correlation without SP
    cluster_cov = np.dot(cluster_std, cluster_corr).dot(cluster_std)  # cov without SP
    # sample from the covariances (not the correlation matrices)
    cluster_covs = [cluster_cov_sp, cluster_cov]
    # sample [0,1] k times to assign each cluster to SP or not
    c_sp = np.random.choice(2, k, p=[p_sp_clusters, 1-p_sp_clusters])
    # sample cluster assignments from a GMM (p must be passed by keyword;
    # the third positional argument of np.random.choice is replace)
    z = np.random.choice(k, N, p=p_clusters)
    # sample data using the cluster assignments z, means, and cluster covariances
    x = np.asarray([np.random.multivariate_normal(mu[z_i],
                    cluster_covs[c_sp[z_i]]) for z_i in z])
    return x, z