Skip to content

Commit 888abe0

Browse files
committed
Removed abundant old methods
1 parent 570e79c commit 888abe0

13 files changed

Lines changed: 5134 additions & 266 deletions

File tree

datasets/socialdiagnosis/data/SocialDiagnosis2011.csv

Lines changed: 5001 additions & 0 deletions
Large diffs are not rendered by default.

synthpop/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@
22
CAT_COLS_DTYPES = ['category', 'bool']
33

44
from synthpop.synthpop import Synthpop
5+
# NOTE: the debug print('yes') was removed here; importing the package must not write to stdout.

synthpop/method/__init__.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from synthpop.method.norm import NormMethod
77
from synthpop.method.normrank import NormRankMethod
88
from synthpop.method.polyreg import PolyregMethod
9-
9+
from synthpop.method.gaussian_copula import GaussianCopulaMethod
1010

1111
EMPTY_METHOD = ''
1212
SAMPLE_METHOD = 'sample'
@@ -17,21 +17,23 @@
1717
NORM_METHOD = 'norm'
1818
NORMRANK_METHOD = 'normrank'
1919
POLYREG_METHOD = 'polyreg'
20+
GC_METHOD = 'gaussian copula'
2021

2122

2223
METHODS_MAP = {EMPTY_METHOD: EmptyMethod,
2324
SAMPLE_METHOD: SampleMethod,
2425
CART_METHOD: CARTMethod,
2526
NORM_METHOD: NormMethod,
2627
NORMRANK_METHOD: NormRankMethod,
27-
POLYREG_METHOD: PolyregMethod
28+
POLYREG_METHOD: PolyregMethod,
29+
GC_METHOD: GaussianCopulaMethod
2830
}
2931

3032

31-
ALL_METHODS = (EMPTY_METHOD, SAMPLE_METHOD, CART_METHOD, PARAMETRIC_METHOD, NORM_METHOD, NORMRANK_METHOD, POLYREG_METHOD)
32-
DEFAULT_METHODS = (CART_METHOD, PARAMETRIC_METHOD)
33-
INIT_METHODS = (SAMPLE_METHOD, CART_METHOD, PARAMETRIC_METHOD)
34-
NA_METHODS = (CART_METHOD, NORM_METHOD, NORMRANK_METHOD, POLYREG_METHOD)
33+
ALL_METHODS = (EMPTY_METHOD, SAMPLE_METHOD, CART_METHOD, PARAMETRIC_METHOD, NORM_METHOD, NORMRANK_METHOD, POLYREG_METHOD, GC_METHOD)
34+
DEFAULT_METHODS = (CART_METHOD, PARAMETRIC_METHOD, GC_METHOD)
35+
INIT_METHODS = (SAMPLE_METHOD, CART_METHOD, PARAMETRIC_METHOD, GC_METHOD)
36+
NA_METHODS = (CART_METHOD, NORM_METHOD, NORMRANK_METHOD, POLYREG_METHOD, GC_METHOD)
3537

3638

3739
# method maps
@@ -49,6 +51,13 @@
4951
'category': CART_METHOD
5052
}
5153

54+
GC_METHOD_MAP = {'int': GC_METHOD,
55+
'float': GC_METHOD,
56+
'datetime': GC_METHOD,
57+
'bool': GC_METHOD,
58+
'category': GC_METHOD
59+
}
60+
5261
SAMPLE_METHOD_MAP = {'int': SAMPLE_METHOD,
5362
'float': SAMPLE_METHOD,
5463
'datetime': SAMPLE_METHOD,
@@ -57,7 +66,8 @@
5766
}
5867

5968
DEFAULT_METHODS_MAP = {CART_METHOD: CART_METHOD_MAP,
60-
PARAMETRIC_METHOD: PARAMETRIC_METHOD_MAP
69+
PARAMETRIC_METHOD: PARAMETRIC_METHOD_MAP,
70+
GC_METHOD: GC_METHOD_MAP
6171
}
6272

6373

@@ -68,5 +78,6 @@
6878
CONT_TO_CAT_METHODS_MAP = {CART_METHOD: CART_METHOD,
6979
NORM_METHOD: POLYREG_METHOD,
7080
NORMRANK_METHOD: POLYREG_METHOD,
71-
POLYREG_METHOD: POLYREG_METHOD
81+
POLYREG_METHOD: POLYREG_METHOD,
82+
GC_METHOD: GC_METHOD
7283
}

synthpop/method/base.py

Lines changed: 0 additions & 57 deletions
This file was deleted.

synthpop/method/cart.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import pandas as pd
33
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
44

5-
from synthpop.method import Method, proper, smooth
5+
from synthpop.method import Method, proper, smooth
66
# global variables
77
from synthpop import NUM_COLS_DTYPES, CAT_COLS_DTYPES
88

synthpop/method/empty.py

Lines changed: 0 additions & 18 deletions
This file was deleted.

synthpop/method/gaussian_copula.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
import numpy as np
2+
import pandas as pd
3+
from scipy.stats import norm, ks_2samp
4+
# from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
5+
6+
from synthpop.method import Method, proper, smooth
7+
# global variables
8+
# from synthpop import NUM_COLS_DTYPES, CAT_COLS_DTYPES
9+
10+
11+
class GaussianCopulaMethod(Method):
    """Synthesise tabular data with a Gaussian copula.

    ``fit`` stores an empirical marginal for every column, maps the data to
    normal scores through per-column ranks, and estimates the mean vector and
    covariance matrix of those scores.  ``predict`` samples from that
    multivariate normal and maps the samples back through the stored
    marginals by linear interpolation.

    Missing values are not handled here: NaNs in the input propagate into the
    fitted mean and covariance.
    """

    def __init__(self, dtype, smoothing=False, proper=False, minibucket=5, random_state=None, *args, **kwargs):
        """Store the configuration and reset the learnt parameters.

        ``dtype``, ``smoothing``, ``proper`` and ``minibucket`` are stored on
        the instance but are not read by ``fit`` or ``predict`` in this class.
        ``random_state`` (None, a seed, or a ``numpy.random.RandomState``)
        controls the sampling done in ``predict``.  Extra positional and
        keyword arguments are accepted and ignored.
        """
        self.dtype = dtype
        self.proper = proper
        self.random_state = random_state
        self.smoothing = smoothing
        self.minibucket = minibucket

        # learnt parameters
        self.means = None
        self.cov_matrix = None
        self.scaler = None
        self.data_marginals = None

    def fit(self, data):
        """
        Fit the Gaussian Copula model to the given data.

        ``data`` is a DataFrame whose columns can be sorted and ranked.
        Raises ``ValueError`` when it has fewer than two rows, because a
        covariance matrix cannot be estimated from a single observation.
        """
        if len(data) < 2:
            raise ValueError("GaussianCopulaMethod.fit needs at least two rows of data")

        # Step 1: Store data marginals (sorted values and matching quantile grid per column)
        self.data_marginals = []
        for col in data.columns:
            sorted_data = np.sort(data[col])
            quantiles = np.linspace(0, 1, len(sorted_data))
            self.data_marginals.append((sorted_data, quantiles, col))

        # Step 2: Gaussianize through the empirical CDF.  Ranks are divided by
        # (count + 1) rather than count so that every value lies strictly
        # inside (0, 1); a value of exactly 1.0 would map to +inf under
        # norm.ppf and turn the fitted mean and covariance into inf/NaN.
        uniform_data = data.rank() / (data.count() + 1)
        gaussian_data = norm.ppf(uniform_data.to_numpy(dtype=float))

        # Step 3: Fit a multivariate Gaussian to the normal scores.
        # np.cov returns a 0-d array for a single column; keep it 2-D so that
        # multivariate_normal accepts it.
        self.means = gaussian_data.mean(axis=0)
        self.cov_matrix = np.atleast_2d(np.cov(gaussian_data, rowvar=False))

    def predict(self, n_samples):
        """
        Generate synthetic data using the fitted Gaussian Copula model.

        Returns a DataFrame with ``n_samples`` rows and one column per fitted
        column.  Raises ``RuntimeError`` when called before ``fit``.

        When ``random_state`` is None the global NumPy random state is used.
        A seed creates a fresh generator on every call, so repeated calls
        return the same sample; a ``RandomState`` instance is used as given.
        """
        if self.data_marginals is None or self.means is None or self.cov_matrix is None:
            raise RuntimeError("GaussianCopulaMethod.predict called before fit")

        seed = self.random_state
        if seed is None:
            sampler = np.random
        elif isinstance(seed, np.random.RandomState):
            sampler = seed
        else:
            sampler = np.random.RandomState(seed)

        # Step 1: Sample from the multivariate normal distribution
        synthetic_gaussian = sampler.multivariate_normal(self.means, self.cov_matrix, n_samples)

        # Step 2: Convert back to uniform distribution using CDF (normal -> uniform)
        synthetic_uniform = norm.cdf(synthetic_gaussian)

        # Step 3: Map uniform data back to the original marginals
        synthetic_data = pd.DataFrame()
        for i, (sorted_data, quantiles, col) in enumerate(self.data_marginals):
            synthetic_data[col] = np.interp(synthetic_uniform[:, i], quantiles, sorted_data)

        return synthetic_data
60+
61+
def evaluate_distribution(real_data, synthetic_data):
    """
    Run a two-sample Kolmogorov-Smirnov test per column of ``real_data``.

    Each column is looked up by name in both frames and missing values are
    dropped before testing.  Returns a dict keyed by column name whose values
    are ``{'ks_stat': ..., 'p_value': ...}``.
    """
    def _ks_summary(name):
        # Compare the non-missing observations of one column.
        observed = real_data[name].dropna()
        generated = synthetic_data[name].dropna()
        statistic, pvalue = ks_2samp(observed, generated)
        return {'ks_stat': statistic, 'p_value': pvalue}

    return {name: _ks_summary(name) for name in real_data.columns}
77+
78+
def evaluate_correlations(real_data, synthetic_data):
    """
    Summarise how far apart the two pairwise correlation matrices are.

    Returns the mean, taken over columns, of the per-column mean absolute
    difference between ``real_data.corr()`` and ``synthetic_data.corr()``.
    """
    gap = (real_data.corr() - synthetic_data.corr()).abs()
    per_column = gap.mean()
    return per_column.mean()  # Average correlation difference
88+
89+
def run_diagnostic(real_data, synthetic_data, target_column):
    """
    Run the distribution and correlation diagnostics on synthetic data.

    Returns a dict with two entries: ``'distribution_results'`` (the output of
    ``evaluate_distribution``) and ``'correlation_diff'`` (the output of
    ``evaluate_correlations``).

    ``target_column`` is accepted but not used by this function; no
    classification model is evaluated here.
    """
    return {
        # Per-column Kolmogorov-Smirnov comparison
        'distribution_results': evaluate_distribution(real_data, synthetic_data),
        # Average absolute difference between the correlation matrices
        'correlation_diff': evaluate_correlations(real_data, synthetic_data),
    }

synthpop/method/norm.py

Lines changed: 0 additions & 54 deletions
This file was deleted.

synthpop/method/normrank.py

Lines changed: 0 additions & 50 deletions
This file was deleted.

0 commit comments

Comments
 (0)