-
Notifications
You must be signed in to change notification settings - Fork 74
Update normalization parameters and add estimator params validation #210
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
567b241
fd6e4a5
ac310c1
7df7818
5536db7
a2a7515
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -61,21 +61,24 @@ | |
| { | ||
| "data": { | ||
| "dataset": "hepmass", | ||
| "split_kwargs": { "train_size": 0.1, "test_size": null } | ||
| "split_kwargs": { "train_size": 0.1, "test_size": null }, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Seems like this is the only case where benchmark behavior changes — is it intended?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think it was done for a reason, but let me check the convergence for both options. |
||
| "preprocessing_kwargs": { "normalize": "standard" } | ||
| }, | ||
| "algorithm": { "estimator_params": {"C": 1e-5} } | ||
| }, | ||
| { | ||
| "data": { | ||
| "dataset": "cifar", | ||
| "split_kwargs": { "train_size": 0.1, "test_size": null } | ||
| "split_kwargs": { "train_size": 0.1, "test_size": null }, | ||
| "preprocessing_kwargs": { "normalize": "mean" } | ||
| }, | ||
| "algorithm": { "estimator_params": {"C": 1e-9} } | ||
| }, | ||
| { | ||
| "data": { | ||
| "dataset": "gisette", | ||
| "split_kwargs": { "train_size": 2000, "test_size": null } | ||
| "split_kwargs": { "train_size": 2000, "test_size": null }, | ||
| "preprocessing_kwargs": { "normalize": "standard" } | ||
| }, | ||
| "algorithm": { "estimator_params": {"C": 1e1} } | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -15,7 +15,7 @@ | |||||
| "algorithm": { "estimator_params": { "C": 100.0, "kernel": "rbf" } } | ||||||
| }, | ||||||
| { | ||||||
| "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } }, | ||||||
| "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } }, | ||||||
| "algorithm": { | ||||||
| "estimator_params": { "C": 100.0, "kernel": ["linear", "poly", "rbf"] } | ||||||
| } | ||||||
|
|
@@ -30,7 +30,7 @@ | |||||
| "data": { | ||||||
| "dataset": "mnist", | ||||||
| "split_kwargs": { "train_size": 20000, "test_size": null }, | ||||||
| "preprocessing_kwargs": { "normalize": false } | ||||||
| "preprocessing_kwargs": {"normalize" : null} | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
Minor adjustment but looks like spaces have been added before colons in a few places throughout the configs |
||||||
| }, | ||||||
| "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } } | ||||||
| } | ||||||
|
|
@@ -45,7 +45,7 @@ | |||||
| "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["linear", "poly", "rbf"] } } | ||||||
| }, | ||||||
| { | ||||||
| "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } }, | ||||||
| "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } }, | ||||||
| "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } } | ||||||
| }, | ||||||
| { | ||||||
|
|
@@ -75,7 +75,7 @@ | |||||
| "algorithm": { "estimator_params": { "nu": 0.1, "kernel": "rbf" } } | ||||||
| }, | ||||||
| { | ||||||
| "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } }, | ||||||
| "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } }, | ||||||
| "algorithm": { "estimator_params": { "nu": 0.9, "kernel": ["linear", "rbf"] } } | ||||||
| } | ||||||
| ], | ||||||
|
|
@@ -89,7 +89,7 @@ | |||||
| "algorithm": { "estimator_params": { "nu": 0.8, "C": 2.0, "kernel": "rbf" } } | ||||||
| }, | ||||||
| { | ||||||
| "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } }, | ||||||
| "data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } }, | ||||||
| "algorithm": { "estimator_params": { "nu": 0.9, "C": 1.0, "kernel": "rbf" } } | ||||||
| }, | ||||||
| { | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -4,14 +4,19 @@ | |||||||||
| "high-load dbscan datasets": { | ||||||||||
| "data": [ | ||||||||||
| { | ||||||||||
| "dataset": ["cifar", "road_network", "covtype"], | ||||||||||
| "dataset" : "cifar", | ||||||||||
| "split_kwargs": { "ignore" : true }, | ||||||||||
|
Comment on lines
+7
to
+8
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||
| "preprocessing_kwargs": { "normalize": "mean" } | ||||||||||
| }, | ||||||||||
| { | ||||||||||
| "dataset": ["road_network", "covtype"], | ||||||||||
| "split_kwargs": { "ignore": true }, | ||||||||||
| "preprocessing_kwargs": { "normalize": true } | ||||||||||
| "preprocessing_kwargs": { "normalize": "standard" } | ||||||||||
| }, | ||||||||||
| { | ||||||||||
| "dataset": "susy", | ||||||||||
| "split_kwargs": { "train_size": 800000 }, | ||||||||||
| "preprocessing_kwargs": { "normalize": true } | ||||||||||
| "preprocessing_kwargs": { "normalize": "standard" } | ||||||||||
| }, | ||||||||||
| { | ||||||||||
| "source": "make_blobs", | ||||||||||
|
|
||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,6 +6,7 @@ pandas | |
| tabulate | ||
| fastparquet | ||
| h5py | ||
| openml | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Looks like you need to rebase from the current main. |
||
| openpyxl | ||
| tqdm | ||
| psutil | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,7 +23,12 @@ | |
| import pandas as pd | ||
| from scipy.sparse import csr_matrix | ||
| from sklearn.model_selection import train_test_split | ||
| from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder | ||
| from sklearn.preprocessing import ( | ||
| MinMaxScaler, | ||
| OneHotEncoder, | ||
| OrdinalEncoder, | ||
| StandardScaler, | ||
| ) | ||
|
|
||
| from ..utils.custom_types import Array | ||
| from ..utils.logger import logger | ||
|
|
@@ -167,7 +172,7 @@ def preprocess_x( | |
| x: Array, | ||
| replace_nan="auto", | ||
| category_encoding="ordinal", | ||
| normalize=False, | ||
| normalize=None, | ||
| force_for_sparse=True, | ||
| **kwargs, | ||
| ) -> Array: | ||
|
|
@@ -219,9 +224,18 @@ def preprocess_x( | |
| pass | ||
| else: | ||
| logger.warning(f'Unknown "{category_encoding}" category encoding type.') | ||
| # Mean-Standard normalization | ||
| # Normalization | ||
| if normalize: | ||
| x = (x - x.mean()) / x.std() | ||
| if normalize == "standard": | ||
| scaler = StandardScaler(with_mean=True, with_std=True) | ||
| elif normalize == "mean": | ||
| scaler = StandardScaler(with_mean=True, with_std=False) | ||
| elif normalize == "minmax": | ||
| scaler = MinMaxScaler(feature_range=(0, 1)) | ||
| else: | ||
| logger.warning(f'Unknown "{normalize}" normalization type.') | ||
| if scaler is not None: | ||
| return pd.DataFrame(scaler.fit_transform(x), columns=x.columns, index=x.index) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Wouldn't this make it ignore |
||
| if return_type == np.ndarray: | ||
| return x.values | ||
| else: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would the jsons with the datasets be able to override this? For example:
https://github.com/avolkov-intel/scikit-learn_bench/blob/a2a75152d6bae2cf3bcdf350125369b04f289b33/configs/regular/knn.json#L8
It's not clear from the docs how it would work when specified more than once.