-
-
Notifications
You must be signed in to change notification settings - Fork 50.5k
Expand file tree
/
Copy pathnaive_bayes.py
More file actions
107 lines (81 loc) · 3.23 KB
/
naive_bayes.py
File metadata and controls
107 lines (81 loc) · 3.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""
Naive Bayes Classifier implementation.
This module implements Gaussian Naive Bayes from scratch without using
external machine learning libraries.
References:
https://en.wikipedia.org/wiki/Naive_Bayes_classifier
"""
import math
def gaussian_probability(value: float, mean: float, variance: float) -> float:
    """
    Evaluate the Gaussian probability density function at ``value``.

    :param value: Point at which to evaluate the density
    :param mean: Mean of the Gaussian distribution
    :param variance: Variance of the Gaussian distribution
    :return: Density at ``value``, or 0.0 when ``variance`` is zero
        (a degenerate distribution is treated as contributing no density)

    >>> round(gaussian_probability(1.0, 1.0, 1.0), 3)
    0.399
    >>> gaussian_probability(1.0, 1.0, 0.0)
    0.0
    """
    # Guard the degenerate case before dividing by the variance below.
    if variance == 0.0:
        return 0.0
    deviation = value - mean
    normalizer = math.sqrt(2.0 * math.pi * variance)
    return math.exp(-(deviation**2) / (2.0 * variance)) / normalizer
class GaussianNaiveBayes:
    """
    Gaussian Naive Bayes classifier.

    Each feature is modeled as an independent Gaussian per class; prediction
    picks the label maximizing ``log P(y) + sum_i log P(x_i | y)``.
    """

    def __init__(self) -> None:
        # Per-class prior probability P(y).
        self.class_priors: dict[int, float] = {}
        # Per-class list of per-feature sample means.
        self.means: dict[int, list[float]] = {}
        # Per-class list of per-feature population variances (divide by n).
        self.variances: dict[int, list[float]] = {}

    def fit(self, features: list[list[float]], labels: list[int]) -> None:
        """
        Train the Gaussian Naive Bayes classifier.

        Refitting replaces all previously learned parameters.

        :param features: Feature matrix
        :param labels: Class labels
        :raises ValueError: If input sizes mismatch or the training set is empty

        >>> model = GaussianNaiveBayes()
        >>> model.fit([[1.0], [2.0], [3.0]], [0, 0, 1])
        """
        if len(features) != len(labels):
            raise ValueError("Features and labels must have the same length")
        if not labels:
            # Explicit error instead of a ZeroDivisionError further down.
            raise ValueError("Training data must not be empty")
        # Reset learned state so a second fit() does not retain stale
        # classes (and their priors/means/variances) from a previous run.
        self.class_priors = {}
        self.means = {}
        self.variances = {}
        separated: dict[int, list[list[float]]] = {}
        for feature_vector, label in zip(features, labels):
            separated.setdefault(label, []).append(feature_vector)
        total_samples = len(labels)
        for label, rows in separated.items():
            self.class_priors[label] = len(rows) / total_samples
            # Transpose rows so each entry is one feature column.
            columns = list(zip(*rows))
            self.means[label] = [sum(column) / len(column) for column in columns]
            self.variances[label] = [
                sum((feature_value - mean) ** 2 for feature_value in column)
                / len(column)
                for column, mean in zip(columns, self.means[label])
            ]

    def predict(self, features: list[list[float]]) -> list[int]:
        """
        Predict class labels for input features.

        :param features: Feature matrix
        :return: Predicted labels
        :raises ValueError: If called before ``fit``

        >>> model = GaussianNaiveBayes()
        >>> X = [[1.0], [2.0], [3.0], [4.0]]
        >>> y = [0, 0, 1, 1]
        >>> model.fit(X, y)
        >>> model.predict([[1.5], [3.5]])
        [0, 1]
        """
        if not self.class_priors:
            # Explicit error instead of an opaque max()-on-empty failure.
            raise ValueError("Model must be fitted before calling predict")
        predictions: list[int] = []
        for row in features:
            scores: list[tuple[int, float]] = []
            for label in self.class_priors:
                # Work in log space to avoid underflow from multiplying
                # many small densities together.
                log_likelihood = math.log(self.class_priors[label])
                for index, value in enumerate(row):
                    probability = gaussian_probability(
                        value,
                        self.means[label][index],
                        self.variances[label][index],
                    )
                    # Zero density (e.g. zero variance) is skipped rather
                    # than taking log(0).
                    if probability > 0.0:
                        log_likelihood += math.log(probability)
                scores.append((label, log_likelihood))
            predictions.append(max(scores, key=lambda pair: pair[1])[0])
        return predictions