NGO-Algorithm-Audit
diff --git a/‎datasets/socialdiagnosis/data/SocialDiagnosis2011.csv‎
Lines changed: 5001 additions & 0 deletions b/‎datasets/socialdiagnosis/data/SocialDiagnosis2011.csv‎
Lines changed: 5001 additions & 0 deletions
diff --git a/‎synthpop/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎synthpop/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎synthpop/method/__init__.py‎
Lines changed: 19 additions & 8 deletions b/‎synthpop/method/__init__.py‎
Lines changed: 19 additions & 8 deletions
diff --git a/‎synthpop/method/base.py‎
Lines changed: 0 additions & 57 deletions b/‎synthpop/method/base.py‎
Lines changed: 0 additions & 57 deletions
diff --git a/‎synthpop/method/cart.py‎
Lines changed: 1 addition & 1 deletion b/‎synthpop/method/cart.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎synthpop/method/empty.py‎
Lines changed: 0 additions & 18 deletions b/‎synthpop/method/empty.py‎
Lines changed: 0 additions & 18 deletions
diff --git a/‎synthpop/method/gaussian_copula.py‎
Lines changed: 106 additions & 0 deletions b/‎synthpop/method/gaussian_copula.py‎
Lines changed: 106 additions & 0 deletions
diff --git a/‎synthpop/method/norm.py‎
Lines changed: 0 additions & 54 deletions b/‎synthpop/method/norm.py‎
Lines changed: 0 additions & 54 deletions
diff --git a/‎synthpop/method/normrank.py‎
Lines changed: 0 additions & 50 deletions b/‎synthpop/method/normrank.py‎
Lines changed: 0 additions & 50 deletions
@@ -2,3 +2,4 @@
 CAT_COLS_DTYPES = ['category', 'bool']
 
 from synthpop.synthpop import Synthpop
+print('yes')
@@ -6,7 +6,7 @@
 from synthpop.method.norm import NormMethod
 from synthpop.method.normrank import NormRankMethod
 from synthpop.method.polyreg import PolyregMethod
-
+from synthpop.method.gaussian_copula import GaussianCopulaMethod
 
 EMPTY_METHOD = ''
 SAMPLE_METHOD = 'sample'
@@ -17,21 +17,23 @@
 NORM_METHOD = 'norm'
 NORMRANK_METHOD = 'normrank'
 POLYREG_METHOD = 'polyreg'
+GC_METHOD = 'gaussian copula' 
 
 
 METHODS_MAP = {EMPTY_METHOD: EmptyMethod,
                SAMPLE_METHOD: SampleMethod,
                CART_METHOD: CARTMethod,
                NORM_METHOD: NormMethod,
                NORMRANK_METHOD: NormRankMethod,
-               POLYREG_METHOD: PolyregMethod
+               POLYREG_METHOD: PolyregMethod,
+               GC_METHOD: GaussianCopulaMethod
                }
 
 
-ALL_METHODS = (EMPTY_METHOD, SAMPLE_METHOD, CART_METHOD, PARAMETRIC_METHOD, NORM_METHOD, NORMRANK_METHOD, POLYREG_METHOD)
-DEFAULT_METHODS = (CART_METHOD, PARAMETRIC_METHOD)
-INIT_METHODS = (SAMPLE_METHOD, CART_METHOD, PARAMETRIC_METHOD)
-NA_METHODS = (CART_METHOD, NORM_METHOD, NORMRANK_METHOD, POLYREG_METHOD)
+ALL_METHODS = (EMPTY_METHOD, SAMPLE_METHOD, CART_METHOD, PARAMETRIC_METHOD, NORM_METHOD, NORMRANK_METHOD, POLYREG_METHOD, GC_METHOD)
+DEFAULT_METHODS = (CART_METHOD, PARAMETRIC_METHOD, GC_METHOD)
+INIT_METHODS = (SAMPLE_METHOD, CART_METHOD, PARAMETRIC_METHOD, GC_METHOD)
+NA_METHODS = (CART_METHOD, NORM_METHOD, NORMRANK_METHOD, POLYREG_METHOD, GC_METHOD)
 
 
 # method maps
@@ -49,6 +51,13 @@
                    'category': CART_METHOD
                    }
 
+GC_METHOD_MAP = {'int': GC_METHOD,
+                 'float': GC_METHOD,
+                 'datetime': GC_METHOD,
+                 'bool': GC_METHOD,
+                 'category': GC_METHOD
+                 }
+
 SAMPLE_METHOD_MAP = {'int': SAMPLE_METHOD,
                      'float': SAMPLE_METHOD,
                      'datetime': SAMPLE_METHOD,
@@ -57,7 +66,8 @@
                      }
 
 DEFAULT_METHODS_MAP = {CART_METHOD: CART_METHOD_MAP,
-                       PARAMETRIC_METHOD: PARAMETRIC_METHOD_MAP
+                       PARAMETRIC_METHOD: PARAMETRIC_METHOD_MAP,
+                       GC_METHOD: GC_METHOD_MAP
                        }
 
 
@@ -68,5 +78,6 @@
 CONT_TO_CAT_METHODS_MAP = {CART_METHOD: CART_METHOD,
                            NORM_METHOD: POLYREG_METHOD,
                            NORMRANK_METHOD: POLYREG_METHOD,
-                           POLYREG_METHOD: POLYREG_METHOD
+                           POLYREG_METHOD: POLYREG_METHOD,
+                           GC_METHOD: GC_METHOD
                            }
@@ -2,7 +2,7 @@
 import pandas as pd
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 
-from synthpop.method import Method, proper, smooth
+from synthpop.method import Method, proper, smooth 
 # global variables
 from synthpop import NUM_COLS_DTYPES, CAT_COLS_DTYPES
 
 
@@ -0,0 +1,106 @@
+import numpy as np
+import pandas as pd
+from scipy.stats import norm, ks_2samp
+# from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
+
+from synthpop.method import Method, proper, smooth
+# global variables
+# from synthpop import NUM_COLS_DTYPES, CAT_COLS_DTYPES
+
+
+class GaussianCopulaMethod(Method):
+    """Generate synthetic tabular data with a Gaussian copula.
+
+    ``fit`` stores each column's empirical marginal (sorted observed values
+    with matching quantile levels) and estimates the mean vector and
+    covariance matrix of the rank-based normal scores.  ``predict`` samples
+    from that multivariate normal and maps every draw back through the stored
+    marginals by linear interpolation, so columns are expected to be numeric.
+    """
+
+    def __init__(self, dtype, smoothing=False, proper=False, minibucket=5, random_state=None, *args, **kwargs):
+        """Store the configuration; nothing is learnt until ``fit`` is called.
+
+        Args:
+            dtype: stored on the instance; not read by ``fit``/``predict``.
+            smoothing: stored on the instance; not read by ``fit``/``predict``.
+            proper: stored on the instance; not read by ``fit``/``predict``.
+            minibucket: stored on the instance; not read by ``fit``/``predict``.
+            random_state: ``None`` to sample from NumPy's global random state,
+                a seed accepted by ``np.random.RandomState``, or an object
+                that already exposes ``multivariate_normal`` (for example a
+                ``RandomState`` or ``Generator``), which is used as-is.
+            *args, **kwargs: accepted and ignored.
+        """
+        self.dtype = dtype
+        self.proper = proper
+        self.random_state = random_state
+        self.smoothing = smoothing
+        self.minibucket = minibucket
+
+        # learnt parameters
+        self.means = None
+        self.cov_matrix = None
+        self.scaler = None  # never assigned elsewhere in this class
+        self.data_marginals = None
+
+    def fit(self, data):
+        """
+        Fit the Gaussian Copula model to the given data.
+
+        Args:
+            data: ``pandas.DataFrame`` of numeric columns.  Missing values are
+                left out of the marginals and of the moment estimates.
+
+        Sets ``data_marginals`` (list of ``(sorted_values, quantiles, column)``
+        tuples), ``means`` (1-D array) and ``cov_matrix`` (2-D array).
+        """
+        # Step 1: Store data marginals (quantiles for each feature)
+        self.data_marginals = []
+        for col in data.columns:
+            # Drop NaN first: np.sort would place it last and the top of the
+            # interpolation table would then map to NaN.
+            sorted_data = np.sort(data[col].dropna().to_numpy())
+            quantiles = np.linspace(0, 1, len(sorted_data))
+            self.data_marginals.append((sorted_data, quantiles, col))
+
+        # Step 2: Convert data to normal scores via the empirical CDF.
+        # rank / (n + 1) stays strictly inside (0, 1); rank(pct=True) maps the
+        # largest value to exactly 1.0, whose normal quantile is +inf and
+        # turns the mean and covariance below into inf/NaN.
+        ranks = data.rank().to_numpy(dtype=float)
+        counts = data.count().to_numpy(dtype=float)  # non-missing per column
+        uniform_data = ranks / (counts + 1.0)
+        gaussian_data = pd.DataFrame(norm.ppf(uniform_data))
+
+        # Step 3: Fit a multivariate Gaussian to the normal scores.
+        # The pandas estimators skip missing values and always return a 2-D
+        # covariance, including for a single column (np.cov returns a 0-d
+        # value there, which multivariate_normal rejects).
+        self.means = gaussian_data.mean(axis=0).to_numpy()
+        self.cov_matrix = gaussian_data.cov().to_numpy()
+
+    def predict(self, n_samples):
+        """
+        Generate synthetic data using the fitted Gaussian Copula model.
+
+        Args:
+            n_samples: number of synthetic rows to draw.
+
+        Returns:
+            ``pandas.DataFrame`` with one column per fitted column, in the
+            order seen by ``fit``.
+
+        Raises:
+            RuntimeError: if called before ``fit``.
+        """
+        if self.data_marginals is None or self.means is None or self.cov_matrix is None:
+            raise RuntimeError("GaussianCopulaMethod.predict() called before fit()")
+
+        # Honour random_state; None keeps the previous behaviour of drawing
+        # from NumPy's global random state.
+        rng = self.random_state
+        if rng is None:
+            rng = np.random
+        elif not hasattr(rng, "multivariate_normal"):
+            rng = np.random.RandomState(rng)
+
+        # Step 1: Sample from the multivariate normal distribution
+        synthetic_gaussian = rng.multivariate_normal(self.means, self.cov_matrix, n_samples)
+
+        # Step 2: Convert back to uniform distribution using CDF (normal -> uniform)
+        synthetic_uniform = norm.cdf(synthetic_gaussian)
+
+        # Step 3: Map uniform data back to the original marginals
+        return pd.DataFrame({
+            col: np.interp(synthetic_uniform[:, i], quantiles, sorted_data)
+            for i, (sorted_data, quantiles, col) in enumerate(self.data_marginals)
+        })
+    
+def evaluate_distribution(real_data, synthetic_data):
+    """
+    Run a two-sample Kolmogorov-Smirnov (KS) test for every column of
+    ``real_data`` against the same-named column of ``synthetic_data``.
+
+    Missing values are dropped from both columns before testing.  Returns a
+    dict keyed by column name whose values have the form
+    ``{'ks_stat': <statistic>, 'p_value': <p-value>}``.
+    """
+    def _ks_summary(name):
+        # ks_2samp's result unpacks to (statistic, p-value).
+        statistic, pvalue = ks_2samp(real_data[name].dropna(), synthetic_data[name].dropna())
+        return {'ks_stat': statistic, 'p_value': pvalue}
+
+    return {name: _ks_summary(name) for name in real_data.columns}
+
+def evaluate_correlations(real_data, synthetic_data):
+    """
+    Summarise how far apart the pairwise correlation matrices of the real and
+    synthetic data are.
+
+    Returns a single number: the mean, over columns, of each column's mean
+    absolute difference between the two correlation matrices.
+    """
+    gap = (real_data.corr() - synthetic_data.corr()).abs()
+    per_column_gap = gap.mean()
+    return per_column_gap.mean()  # Average correlation difference
+
+def run_diagnostic(real_data, synthetic_data, target_column):
+    """
+    Bundle the per-column distribution comparison and the correlation
+    comparison of real versus synthetic data into one dict.
+
+    ``target_column`` is accepted but not used by this implementation; no
+    classification-model evaluation is performed here.
+
+    Returns a dict with two keys: ``'distribution_results'`` (output of
+    ``evaluate_distribution``) and ``'correlation_diff'`` (output of
+    ``evaluate_correlations``).
+    """
+    # Dict values are evaluated in key order, so the distribution check still
+    # runs before the correlation check.
+    return {
+        'distribution_results': evaluate_distribution(real_data, synthetic_data),
+        'correlation_diff': evaluate_correlations(real_data, synthetic_data),
+    }
Original file line number	Diff line number	Diff line change
`@@ -2,3 +2,4 @@`
`2`	`2`	`CAT_COLS_DTYPES = ['category', 'bool']`
`3`	`3`
`4`	`4`	`from synthpop.synthpop import Synthpop`
	`5`	`+print('yes')`