-
-
Notifications
You must be signed in to change notification settings - Fork 50.5k
Add naive bayes classifier #14065
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Add naive bayes classifier #14065
Changes from 3 commits
3c5e410
6dd885c
5d3907f
1526ca9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,113 @@ | ||
| """ | ||
| Multinomial Naive Bayes Classifier implementation. | ||
|
|
||
| This module implements Multinomial Naive Bayes from scratch without using | ||
| external machine learning libraries. It is commonly used for text | ||
| classification tasks such as spam detection. | ||
|
|
||
| References: | ||
| https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Multinomial_naive_bayes | ||
| """ | ||
|
|
||
| import math | ||
|
|
||
|
|
||
class MultinomialNaiveBayes:
    """
    Multinomial Naive Bayes classifier for count-valued features.

    A sample ``x`` is scored for each class ``c`` as
    ``log P(c) + sum_i x_i * log P(feature_i | c)`` and assigned to the class
    with the highest score. The per-class feature probabilities are estimated
    from the summed training counts using additive (Laplace) smoothing.

    >>> model = MultinomialNaiveBayes()
    >>> model.fit([[2, 1], [1, 1], [0, 2]], [0, 0, 1])
    >>> model.predict([[1, 0], [0, 2]])
    [0, 1]
    """

    def __init__(self, alpha: float = 1.0) -> None:
        """
        Initialize the classifier.

        :param alpha: Laplace smoothing parameter
        :raises ValueError: If alpha is not strictly positive

        >>> MultinomialNaiveBayes(alpha=0)
        Traceback (most recent call last):
            ...
        ValueError: Alpha must be greater than 0
        """
        if alpha <= 0:
            raise ValueError("Alpha must be greater than 0")

        self.alpha = alpha
        # NOTE: despite the name, the stored priors are natural logarithms.
        self.class_priors: dict[int, float] = {}
        # Per class: log P(feature_i | class) for every feature index.
        self.feature_log_prob: dict[int, list[float]] = {}
        self.num_features: int = 0

    def fit(self, features: list[list[int]], labels: list[int]) -> None:
        """
        Train the Multinomial Naive Bayes classifier.

        Any previously learned parameters are discarded, so the model can be
        safely refitted on a different data set.

        :param features: Feature matrix (counts of features)
        :param labels: Class labels
        :raises ValueError: If input sizes mismatch, the matrix is empty,
            rows differ in length, or a count is negative

        >>> model = MultinomialNaiveBayes()
        >>> X = [[2, 1], [1, 1], [0, 2]]
        >>> y = [0, 0, 1]
        >>> model.fit(X, y)
        >>> model.fit([[3, 0], [0, 3]], [5, 7])
        >>> sorted(model.class_priors)
        [5, 7]
        >>> model.fit([[1, 2], [3]], [0, 1])
        Traceback (most recent call last):
            ...
        ValueError: All feature rows must have the same length
        """
        if len(features) != len(labels):
            raise ValueError("Features and labels must have the same length")

        if not features:
            raise ValueError("Feature matrix must not be empty")

        num_features = len(features[0])
        for row in features:
            if len(row) != num_features:
                raise ValueError("All feature rows must have the same length")
            # A negative count could make the smoothed ratio non-positive and
            # surface later as an opaque "math domain error".
            if any(value < 0 for value in row):
                raise ValueError("Feature counts must be non-negative")

        # Accumulate per-class totals in a single pass. Dict insertion order
        # keeps classes in order of first appearance, which decides ties in
        # ``predict``.
        counts_by_label: dict[int, list[int]] = {}
        samples_by_label: dict[int, int] = {}
        for row, label in zip(features, labels):
            if label not in counts_by_label:
                counts_by_label[label] = [0] * num_features
                samples_by_label[label] = 0
            totals = counts_by_label[label]
            for index, value in enumerate(row):
                totals[index] += value
            samples_by_label[label] += 1

        total_samples = len(labels)
        class_priors: dict[int, float] = {}
        feature_log_prob: dict[int, list[float]] = {}
        for label, totals in counts_by_label.items():
            class_priors[label] = math.log(samples_by_label[label] / total_samples)
            denominator = sum(totals) + self.alpha * num_features
            feature_log_prob[label] = [
                math.log((count + self.alpha) / denominator) for count in totals
            ]

        # Replace (rather than update) the learned state so that classes from
        # an earlier fit cannot leak into later predictions.
        self.num_features = num_features
        self.class_priors = class_priors
        self.feature_log_prob = feature_log_prob

    def predict(self, features: list[list[int]]) -> list[int]:
        """
        Predict class labels for input features.

        Ties are resolved in favour of the class that appeared first in the
        training labels.

        :param features: Feature matrix
        :return: Predicted labels
        :raises ValueError: If the model has not been fitted or a row does not
            have ``num_features`` entries

        >>> model = MultinomialNaiveBayes()
        >>> X = [[2, 1], [1, 1], [0, 2]]
        >>> y = [0, 0, 1]
        >>> model.fit(X, y)
        >>> model.predict([[1, 0], [0, 2]])
        [0, 1]
        >>> MultinomialNaiveBayes().predict([[1, 0]])
        Traceback (most recent call last):
            ...
        ValueError: Classifier must be fitted before calling predict
        """
        if not self.class_priors:
            raise ValueError("Classifier must be fitted before calling predict")

        predictions: list[int] = []

        for row in features:
            if len(row) != self.num_features:
                raise ValueError(
                    "Each row must have the same number of features as the "
                    "training data"
                )

            class_scores = [
                (label, self._log_score(label, row)) for label in self.class_priors
            ]
            predictions.append(max(class_scores, key=lambda item: item[1])[0])

        return predictions

    def _log_score(self, label: int, row: list[int]) -> float:
        """
        Return the unnormalized log posterior of ``label`` for one sample.
        """
        score = self.class_priors[label]
        for value, log_prob in zip(row, self.feature_log_prob[label]):
            score += value * log_prob
        return score
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,107 @@ | ||
| """ | ||
| Naive Bayes Classifier implementation. | ||
|
|
||
| This module implements Gaussian Naive Bayes from scratch without using | ||
| external machine learning libraries. | ||
|
|
||
| References: | ||
| https://en.wikipedia.org/wiki/Naive_Bayes_classifier | ||
| """ | ||
|
|
||
| from typing import Dict, List, Tuple | ||
|
Check failure on line 11 in machine_learning/naive_bayes.py
|
||
| import math | ||
|
Check failure on line 12 in machine_learning/naive_bayes.py
|
||
|
|
||
|
|
||
def gaussian_probability(x: float, mean: float, variance: float) -> float:
    """
    Calculate the Gaussian (normal) probability density at ``x``.

    :param x: Point at which the density is evaluated
    :param mean: Mean of the distribution
    :param variance: Variance of the distribution (must not be negative)
    :return: Density value; ``0.0`` when the variance is exactly zero, because
        the density is not defined for a degenerate distribution
    :raises ValueError: If the variance is negative

    >>> round(gaussian_probability(1.0, 1.0, 1.0), 3)
    0.399
    >>> gaussian_probability(1.0, 1.0, 0.0)
    0.0
    >>> gaussian_probability(1.0, 1.0, -1.0)
    Traceback (most recent call last):
        ...
    ValueError: Variance must be non-negative
    """
    # Fail with a descriptive message instead of the opaque "math domain
    # error" that math.sqrt would raise further down.
    if variance < 0.0:
        raise ValueError("Variance must be non-negative")

    if variance == 0.0:
        return 0.0

    exponent = math.exp(-((x - mean) ** 2) / (2.0 * variance))
    coefficient = 1.0 / math.sqrt(2.0 * math.pi * variance)
    return coefficient * exponent
|
|
||
|
|
||
class GaussianNaiveBayes:
    """
    Gaussian Naive Bayes classifier.

    Every feature is modelled, per class, as an independent normal
    distribution. A sample is assigned to the class with the largest
    ``log P(class) + sum_i log N(x_i | mean_i, variance_i)``.

    Log densities are evaluated directly (instead of taking the logarithm of
    a density) so that values far away from a class mean are still penalized
    rather than underflowing to zero. A small variance floor keeps features
    that are constant within a class usable.

    >>> model = GaussianNaiveBayes()
    >>> model.fit([[1.0], [2.0], [3.0], [4.0]], [0, 0, 1, 1])
    >>> model.predict([[1.5], [3.5]])
    [0, 1]
    """

    def __init__(self, var_smoothing: float = 1e-9) -> None:
        """
        Initialize the classifier.

        :param var_smoothing: Fraction of the largest overall feature variance
            that is added to every per-class variance when scoring
        :raises ValueError: If var_smoothing is not strictly positive
        """
        if var_smoothing <= 0:
            raise ValueError("var_smoothing must be greater than 0")

        self.var_smoothing = var_smoothing
        self.class_priors: dict[int, float] = {}
        self.means: dict[int, list[float]] = {}
        # Raw (unsmoothed) population variances per class and feature.
        self.variances: dict[int, list[float]] = {}
        # Absolute amount added to each variance in ``predict``; set by ``fit``.
        self._variance_floor: float = var_smoothing

    @staticmethod
    def _population_variance(values: tuple[float, ...]) -> tuple[float, float]:
        """
        Return ``(mean, population variance)`` of a non-empty value sequence.
        """
        mean = sum(values) / len(values)
        variance = sum((value - mean) ** 2 for value in values) / len(values)
        return mean, variance

    def fit(self, features: list[list[float]], labels: list[int]) -> None:
        """
        Train the Gaussian Naive Bayes classifier.

        Any previously learned parameters are discarded, so the model can be
        safely refitted on a different data set.

        :param features: Feature matrix
        :param labels: Class labels
        :raises ValueError: If input sizes mismatch, the matrix is empty, or
            rows differ in length

        >>> model = GaussianNaiveBayes()
        >>> model.fit([[1.0], [2.0], [3.0]], [0, 0, 1])
        >>> model.fit([], [])
        Traceback (most recent call last):
            ...
        ValueError: Feature matrix must not be empty
        """
        if len(features) != len(labels):
            raise ValueError("Features and labels must have the same length")

        if not features:
            raise ValueError("Feature matrix must not be empty")

        # zip(*rows) below would silently truncate ragged rows.
        num_features = len(features[0])
        if any(len(row) != num_features for row in features):
            raise ValueError("All feature rows must have the same length")

        separated: dict[int, list[list[float]]] = {}
        for feature_vector, label in zip(features, labels):
            separated.setdefault(label, []).append(feature_vector)

        total_samples = len(labels)
        class_priors: dict[int, float] = {}
        means: dict[int, list[float]] = {}
        variances: dict[int, list[float]] = {}

        for label, rows in separated.items():
            class_priors[label] = len(rows) / total_samples
            statistics = [self._population_variance(col) for col in zip(*rows)]
            means[label] = [mean for mean, _ in statistics]
            variances[label] = [variance for _, variance in statistics]

        # Scale the floor by the spread of the whole data set so that it is
        # meaningful regardless of the unit of the features. Fall back to the
        # bare smoothing value when every feature is constant.
        largest_variance = max(
            (self._population_variance(col)[1] for col in zip(*features)),
            default=0.0,
        )
        floor = self.var_smoothing * largest_variance

        # Replace (rather than update) the learned state so that classes from
        # an earlier fit cannot leak into later predictions.
        self.class_priors = class_priors
        self.means = means
        self.variances = variances
        self._variance_floor = floor if floor > 0.0 else self.var_smoothing

    def _log_posterior(self, label: int, row: list[float]) -> float:
        """
        Return the unnormalized log posterior of ``label`` for one sample.
        """
        means = self.means[label]
        if len(row) != len(means):
            raise ValueError(
                "Each row must have the same number of features as the "
                "training data"
            )

        score = math.log(self.class_priors[label])
        for value, mean, variance in zip(row, means, self.variances[label]):
            smoothed = variance + self._variance_floor
            # log N(value | mean, smoothed), written out to avoid exp underflow.
            score -= 0.5 * math.log(2.0 * math.pi * smoothed)
            score -= (value - mean) ** 2 / (2.0 * smoothed)
        return score

    def predict(self, features: list[list[float]]) -> list[int]:
        """
        Predict class labels for input features.

        Ties are resolved in favour of the class that appeared first in the
        training labels.

        :param features: Feature matrix
        :return: Predicted labels
        :raises ValueError: If the model has not been fitted or a row does not
            have the same number of features as the training data

        >>> model = GaussianNaiveBayes()
        >>> X = [[1.0], [2.0], [3.0], [4.0]]
        >>> y = [0, 0, 1, 1]
        >>> model.fit(X, y)
        >>> model.predict([[1.5], [3.5]])
        [0, 1]
        >>> far = GaussianNaiveBayes()
        >>> far.fit([[0.0], [1.0], [100.0], [101.0]], [0, 0, 1, 1])
        >>> far.predict([[60.0]])
        [1]
        """
        if not self.class_priors:
            raise ValueError("Classifier must be fitted before calling predict")

        predictions: list[int] = []

        for row in features:
            scores = [
                (label, self._log_posterior(label, row))
                for label in self.class_priors
            ]
            predictions.append(max(scores, key=lambda pair: pair[1])[0])

        return predictions
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please provide a descriptive name for the parameter: `x`