-
-
Notifications
You must be signed in to change notification settings - Fork 50.5k
Expand file tree
/
Copy pathnaive_bayes.py
More file actions
107 lines (81 loc) · 3.23 KB
/
naive_bayes.py
File metadata and controls
107 lines (81 loc) · 3.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""
Naive Bayes Classifier implementation.
This module implements Gaussian Naive Bayes from scratch without using
external machine learning libraries.
References:
https://en.wikipedia.org/wiki/Naive_Bayes_classifier
"""
import math
def gaussian_probability(value: float, mean: float, variance: float) -> float:
    """
    Evaluate the Gaussian probability density function at ``value``.

    :param value: Point at which to evaluate the density
    :param mean: Mean of the Gaussian distribution
    :param variance: Variance of the Gaussian distribution
    :return: Density at ``value``, or 0.0 when ``variance`` is zero
        (a degenerate distribution is treated as contributing no density)

    >>> round(gaussian_probability(1.0, 1.0, 1.0), 3)
    0.399
    >>> gaussian_probability(1.0, 1.0, 0.0)
    0.0
    """
    # Guard the degenerate case before dividing by the variance below.
    if variance == 0.0:
        return 0.0
    deviation = value - mean
    normalizer = math.sqrt(2.0 * math.pi * variance)
    return math.exp(-(deviation**2) / (2.0 * variance)) / normalizer
class GaussianNaiveBayes:
    """
    Gaussian Naive Bayes classifier.

    Each feature is modeled as an independent Gaussian per class; prediction
    picks the label maximizing ``log P(y) + sum_i log P(x_i | y)``.
    """

    def __init__(self) -> None:
        # Per-class prior probability P(y).
        self.class_priors: dict[int, float] = {}
        # Per-class list of per-feature sample means.
        self.means: dict[int, list[float]] = {}
        # Per-class list of per-feature population variances (divide by n).
        self.variances: dict[int, list[float]] = {}

    def fit(self, features: list[list[float]], labels: list[int]) -> None:
        """
        Train the Gaussian Naive Bayes classifier.

        Refitting replaces all previously learned parameters.

        :param features: Feature matrix
        :param labels: Class labels
        :raises ValueError: If input sizes mismatch or the training set is empty

        >>> model = GaussianNaiveBayes()
        >>> model.fit([[1.0], [2.0], [3.0]], [0, 0, 1])
        """
        if len(features) != len(labels):
            raise ValueError("Features and labels must have the same length")
        if not labels:
            # Explicit error instead of a ZeroDivisionError further down.
            raise ValueError("Training data must not be empty")
        # Reset learned state so a second fit() does not retain stale
        # classes (and their priors/means/variances) from a previous run.
        self.class_priors = {}
        self.means = {}
        self.variances = {}
        separated: dict[int, list[list[float]]] = {}
        for feature_vector, label in zip(features, labels):
            separated.setdefault(label, []).append(feature_vector)
        total_samples = len(labels)
        for label, rows in separated.items():
            self.class_priors[label] = len(rows) / total_samples
            # Transpose rows so each entry is one feature column.
            columns = list(zip(*rows))
            self.means[label] = [sum(column) / len(column) for column in columns]
            self.variances[label] = [
                sum((feature_value - mean) ** 2 for feature_value in column)
                / len(column)
                for column, mean in zip(columns, self.means[label])
            ]

    def predict(self, features: list[list[float]]) -> list[int]:
        """
        Predict class labels for input features.

        :param features: Feature matrix
        :return: Predicted labels
        :raises ValueError: If called before ``fit``

        >>> model = GaussianNaiveBayes()
        >>> X = [[1.0], [2.0], [3.0], [4.0]]
        >>> y = [0, 0, 1, 1]
        >>> model.fit(X, y)
        >>> model.predict([[1.5], [3.5]])
        [0, 1]
        """
        if not self.class_priors:
            # Explicit error instead of an opaque max()-on-empty failure.
            raise ValueError("Model must be fitted before calling predict")
        predictions: list[int] = []
        for row in features:
            scores: list[tuple[int, float]] = []
            for label in self.class_priors:
                # Work in log space to avoid underflow from multiplying
                # many small densities together.
                log_likelihood = math.log(self.class_priors[label])
                for index, value in enumerate(row):
                    probability = gaussian_probability(
                        value,
                        self.means[label][index],
                        self.variances[label][index],
                    )
                    # Zero density (e.g. zero variance) is skipped rather
                    # than taking log(0).
                    if probability > 0.0:
                        log_likelihood += math.log(probability)
                scores.append((label, log_likelihood))
            predictions.append(max(scores, key=lambda pair: pair[1])[0])
        return predictions