1+ import numpy as np
2+ import pandas as pd
3+ from scipy .stats import norm , ks_2samp
4+ # from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
5+
6+ from synthpop .method import Method , proper , smooth
7+ # global variables
8+ # from synthpop import NUM_COLS_DTYPES, CAT_COLS_DTYPES
9+
10+
class GaussianCopulaMethod(Method):
    """Synthesise tabular data with a Gaussian copula.

    ``fit`` records each column's empirical marginal (its sorted values paired
    with evenly spaced quantile levels) and estimates the mean vector and
    covariance matrix of the rank-based normal scores.  ``predict`` draws from
    that multivariate normal and maps every draw back through the stored
    marginals by linear interpolation.
    """

    def __init__(self, dtype, smoothing=False, proper=False, minibucket=5,
                 random_state=None, *args, **kwargs):
        """Store the configuration; nothing is learnt until ``fit`` is called.

        Args:
            dtype: Stored on the instance; not read by ``fit`` or ``predict``.
            smoothing: Stored on the instance; not read by ``fit`` or ``predict``.
            proper: Stored on the instance; not read by ``fit`` or ``predict``.
            minibucket: Stored on the instance; not read by ``fit`` or ``predict``.
            random_state: ``None`` (use NumPy's global random state), a seed
                accepted by ``numpy.random.RandomState``, or an existing
                ``RandomState``/``Generator`` instance.  Used by ``predict``.
            *args: Accepted and ignored.
            **kwargs: Accepted and ignored.
        """
        self.dtype = dtype
        self.proper = proper
        self.random_state = random_state
        self.smoothing = smoothing
        self.minibucket = minibucket

        # learnt parameters
        self.means = None
        self.cov_matrix = None
        self.scaler = None  # initialised for completeness; never assigned by this class
        self.data_marginals = None

    def fit(self, data):
        """Fit the Gaussian copula model to ``data``.

        Args:
            data: ``pandas.DataFrame`` whose columns are the variables to model.

        Raises:
            ValueError: If ``data`` has fewer than two rows, because a
                covariance matrix cannot be estimated from them.
        """
        if len(data) < 2:
            raise ValueError("GaussianCopulaMethod.fit needs at least two rows of data")

        # Step 1: store data marginals (sorted values and quantile levels per column)
        self.data_marginals = []
        for col in data.columns:
            sorted_data = np.sort(data[col])
            quantiles = np.linspace(0, 1, len(sorted_data))
            self.data_marginals.append((sorted_data, quantiles, col))

        # Step 2: Gaussianise through the empirical CDF.
        # Ranks are divided by (count + 1) rather than count so that every
        # value lies strictly inside (0, 1).  A plain percentile rank assigns
        # 1.0 to each column's maximum, and norm.ppf(1.0) is +inf, which turns
        # the fitted means and covariance into inf/NaN.
        uniform_data = data.rank() / (data.count() + 1)
        gaussian_data = norm.ppf(uniform_data.to_numpy(dtype=float))

        # Step 3: fit a multivariate Gaussian to the normal scores.
        self.means = gaussian_data.mean(axis=0)
        # np.cov returns a 0-d array for a single column; keep the result 2-D
        # so multivariate_normal accepts it.
        self.cov_matrix = np.atleast_2d(np.cov(gaussian_data, rowvar=False))

    def predict(self, n_samples):
        """Generate ``n_samples`` synthetic rows from the fitted model.

        When ``random_state`` is an integer seed, a fresh ``RandomState`` is
        built from it on every call, so repeated calls return the same sample.

        Args:
            n_samples: Number of rows to draw.

        Returns:
            ``pandas.DataFrame`` with one column per fitted column, in the
            order seen by ``fit``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.means is None or self.cov_matrix is None or self.data_marginals is None:
            raise RuntimeError("GaussianCopulaMethod.predict called before fit")

        # Honour the configured random_state instead of always using the global RNG.
        seed = self.random_state
        if seed is None:
            rng = np.random
        elif isinstance(seed, (np.random.RandomState, np.random.Generator)):
            rng = seed
        else:
            rng = np.random.RandomState(seed)

        # Step 1: sample from the multivariate normal distribution
        synthetic_gaussian = rng.multivariate_normal(self.means, self.cov_matrix, n_samples)

        # Step 2: normal -> uniform via the standard normal CDF
        synthetic_uniform = norm.cdf(synthetic_gaussian)

        # Step 3: uniform -> original scale by interpolating each stored marginal
        synthetic_data = pd.DataFrame()
        for i, (sorted_data, quantiles, col) in enumerate(self.data_marginals):
            synthetic_data[col] = np.interp(synthetic_uniform[:, i], quantiles, sorted_data)

        return synthetic_data
60+
def evaluate_distribution(real_data, synthetic_data):
    """Run a two-sample Kolmogorov-Smirnov test on every column of ``real_data``.

    Missing values are dropped from both the real and the synthetic column
    before the test is run.

    Args:
        real_data: DataFrame of real observations; its columns drive the loop.
        synthetic_data: DataFrame that must contain the same column labels.

    Returns:
        Dict mapping each column label to ``{'ks_stat': ..., 'p_value': ...}``.
    """

    def _ks_summary(name):
        # ks_2samp's result unpacks into (statistic, p-value).
        statistic, pvalue = ks_2samp(
            real_data[name].dropna(),
            synthetic_data[name].dropna(),
        )
        return {'ks_stat': statistic, 'p_value': pvalue}

    return {name: _ks_summary(name) for name in real_data.columns}
77+
def evaluate_correlations(real_data, synthetic_data):
    """Return the average absolute gap between two pairwise correlation matrices.

    The correlation matrix of each DataFrame is computed with ``DataFrame.corr``,
    the element-wise absolute difference is taken, and the result is reduced
    by averaging each column and then averaging those column means.

    Args:
        real_data: DataFrame of real observations.
        synthetic_data: DataFrame of synthetic observations.

    Returns:
        A single number: the mean of the per-column mean absolute differences.
    """
    gap = (real_data.corr() - synthetic_data.corr()).abs()
    per_column_mean = gap.mean()
    return per_column_mean.mean()
88+
def run_diagnostic(real_data, synthetic_data, target_column):
    """Collect distribution and correlation diagnostics for synthetic data.

    Args:
        real_data: DataFrame of real observations.
        synthetic_data: DataFrame of synthetic observations.
        target_column: Accepted but not used by this function; no
            classification-model evaluation is performed here.

    Returns:
        Dict with two entries: ``'distribution_results'`` (the output of
        ``evaluate_distribution``) and ``'correlation_diff'`` (the output of
        ``evaluate_correlations``).
    """
    # The distribution check is evaluated first, then the correlation check.
    return {
        'distribution_results': evaluate_distribution(real_data, synthetic_data),
        'correlation_diff': evaluate_correlations(real_data, synthetic_data),
    }
0 commit comments