diff --git a/.gitignore b/.gitignore
index 2cad792..0808c4a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,10 @@
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
-## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
+## Get latest from `dotnet new gitignore`
-Simple/
+# dotenv files
+.env
# User-specific files
*.rsuser
@@ -31,7 +32,6 @@ x86/
bld/
[Bb]in/
[Oo]bj/
-[Oo]ut/
[Ll]og/
[Ll]ogs/
@@ -60,11 +60,14 @@ dlldata.c
# Benchmark Results
BenchmarkDotNet.Artifacts/
-# .NET Core
+# .NET
project.lock.json
project.fragment.lock.json
artifacts/
+# Tye
+.tye/
+
# ASP.NET Scaffolding
ScaffoldingReadMe.txt
@@ -85,6 +88,8 @@ StyleCopReport.xml
*.pgc
*.pgd
*.rsp
+# but not Directory.Build.rsp, as it configures directory-level build defaults
+!Directory.Build.rsp
*.sbr
*.tlb
*.tli
@@ -93,6 +98,7 @@ StyleCopReport.xml
*.tmp_proj
*_wpftmp.csproj
*.log
+*.tlog
*.vspscc
*.vssscc
.builds
@@ -296,6 +302,17 @@ node_modules/
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
+# Visual Studio 6 auto-generated project file (contains which files were open etc.)
+*.vbp
+
+# Visual Studio 6 workspace and project file (working project files containing files to include in project)
+*.dsw
+*.dsp
+
+# Visual Studio 6 technical files
+*.ncb
+*.aps
+
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
@@ -352,6 +369,9 @@ ASALocalRun/
# Local History for Visual Studio
.localhistory/
+# Visual Studio History (VSHistory) files
+.vshistory/
+
# BeatPulse healthcheck temp database
healthchecksdb
@@ -363,4 +383,100 @@ MigrationBackup/
# Fody - auto-generated XML schema
FodyWeavers.xsd
-/Simple/Simple.csproj
+
+# VS Code files for those working on multiple tools
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+*.code-workspace
+
+# Local History for Visual Studio Code
+.history/
+
+# Windows Installer files from build outputs
+*.cab
+*.msi
+*.msix
+*.msm
+*.msp
+
+# JetBrains Rider
+*.sln.iml
+.idea/
+
+##
+## Visual Studio for Mac
+##
+
+
+# globs
+Makefile.in
+*.userprefs
+*.usertasks
+config.make
+config.status
+aclocal.m4
+install-sh
+autom4te.cache/
+*.tar.gz
+tarballs/
+test-results/
+
+# content below from: https://github.com/github/gitignore/blob/main/Global/macOS.gitignore
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+# content below from: https://github.com/github/gitignore/blob/main/Global/Windows.gitignore
+# Windows thumbnail cache files
+Thumbs.db
+ehthumbs.db
+ehthumbs_vista.db
+
+# Dump file
+*.stackdump
+
+# Folder config file
+[Dd]esktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Windows Installer files
+*.cab
+*.msi
+*.msix
+*.msm
+*.msp
+
+# Windows shortcuts
+*.lnk
+
+# Vim temporary swap files
+*.swp
diff --git a/Directory.Build.props b/Directory.Build.props
index 623b3e5..e6f56a8 100644
--- a/Directory.Build.props
+++ b/Directory.Build.props
@@ -2,6 +2,7 @@
net10.0
enable
+ true
diff --git a/MachineLearning.Benchmarks/MachineLearning.Benchmarks.csproj b/ML.Benchy/ML.Benchy.csproj
similarity index 66%
rename from MachineLearning.Benchmarks/MachineLearning.Benchmarks.csproj
rename to ML.Benchy/ML.Benchy.csproj
index d8cd377..375fbfe 100644
--- a/MachineLearning.Benchmarks/MachineLearning.Benchmarks.csproj
+++ b/ML.Benchy/ML.Benchy.csproj
@@ -8,12 +8,11 @@
-
+
-
-
+
diff --git a/ML.Benchy/Program.cs b/ML.Benchy/Program.cs
new file mode 100644
index 0000000..032bb8d
--- /dev/null
+++ b/ML.Benchy/Program.cs
@@ -0,0 +1,44 @@
+using System.Buffers;
+using Ametrin.Numerics;
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Running;
+using ML.Core.Training;
+using Weight = float;
+
+BenchmarkRunner.Run();
+
+[MemoryDiagnoser(false)]
+public class Benchmarks
+{
+ [Params(512)]
+ public int Size { get; set; }
+ private Vector logits;
+ private Vector expected;
+ private Vector destination;
+
+ private AdamOptimizer optimizer = new() { LearningRate = 0.01f };
+
+
+ [GlobalSetup]
+ public void Setup()
+ {
+ logits = Vector.Create(Size);
+ logits.Uniform(-1, 1, new Random(43));
+ expected = Vector.Create(Size);
+ expected.Uniform(-1, 1, new Random(68));
+ destination = Vector.Create(Size);
+ optimizer.Init();
+ }
+
+ [Benchmark]
+ public void Delegates()
+ {
+ // SpanOperations.MapTo(logits.AsSpan(), expected.AsSpan(), destination.AsSpan(), optimizer.WeightReduction, optimizer.WeightReduction);
+ }
+
+ [Benchmark]
+ public void Static()
+ {
+ // SpanOperations.MapTo(optimizer.WeightReductionOperation, logits.AsSpan(), expected.AsSpan(), destination.AsSpan());
+ }
+}
\ No newline at end of file
diff --git a/ML.Core/Attributes.cs b/ML.Core/Attributes.cs
new file mode 100644
index 0000000..0f9de7a
--- /dev/null
+++ b/ML.Core/Attributes.cs
@@ -0,0 +1,18 @@
+namespace ML.Core.Attributes;
+
+#pragma warning disable CS9113 // Parameter is unread. only required by sourcegen
+[AttributeUsage(AttributeTargets.Property)]
+public sealed class SubModuleAttribute : Attribute;
+
+[AttributeUsage(AttributeTargets.Property)]
+public sealed class WeightsAttribute : Attribute;
+
+[AttributeUsage(AttributeTargets.Property)]
+public sealed class PropertyAttribute : Attribute;
+
+[AttributeUsage(AttributeTargets.Class)]
+public sealed class GeneratedModuleAttribute(bool IncludeSerializer = false) : Attribute;
+
+[AttributeUsage(AttributeTargets.Class)]
+public sealed class GeneratedAdamAttribute(Type module) : Attribute;
+#pragma warning restore CS9113 // Parameter is unread.
diff --git a/ML.Core/Converters/MatrixConverter.cs b/ML.Core/Converters/MatrixConverter.cs
new file mode 100644
index 0000000..0ebd23b
--- /dev/null
+++ b/ML.Core/Converters/MatrixConverter.cs
@@ -0,0 +1,32 @@
+using Ametrin.Serializer;
+
+namespace ML.Core.Converters;
+
+public sealed class MatrixConverter : ISerializationConverter
+{
+ static MatrixConverter()
+ {
+ AmetrinSerializer.RegisterSerializer();
+ }
+
+ public static Result TryReadValue(IAmetrinReader reader)
+ {
+ using var objectReader = reader.ReadStartObject();
+ var rowCount = objectReader.ReadInt32Property("RowCount");
+ objectReader.ReadPropertyName("Storage");
+ var storage = VectorConverter.ReadValue(objectReader);
+ reader.ReadEndObject();
+ Debug.Assert(storage.Count % rowCount == 0);
+ var columnCount = storage.Count / rowCount;
+ return Matrix.Of(rowCount, columnCount, storage);
+ }
+
+ public static void WriteValue(IAmetrinWriter writer, Matrix value)
+ {
+ using var objectWriter = writer.WriteStartObject();
+ objectWriter.WriteInt32Property("RowCount", value.RowCount);
+ objectWriter.WritePropertyName("Storage");
+ VectorConverter.WriteValue(objectWriter, value.Storage);
+ writer.WriteEndObject();
+ }
+}
\ No newline at end of file
diff --git a/ML.Core/Converters/ModuleSerializer.cs b/ML.Core/Converters/ModuleSerializer.cs
new file mode 100644
index 0000000..fa9cfe4
--- /dev/null
+++ b/ML.Core/Converters/ModuleSerializer.cs
@@ -0,0 +1,49 @@
+using System.IO;
+using System.Runtime.CompilerServices;
+using Ametrin.Serializer;
+using Ametrin.Serializer.Readers;
+using Ametrin.Serializer.Writers;
+using ML.Core.Modules;
+
+namespace ML.Core.Converters;
+
+public static class ModuleSerializer
+{
+ public const string FILE_EXTENSION = ".gmw";
+ public const uint FORMAT_VERSION = 3;
+
+#pragma warning disable CA2255
+ [ModuleInitializer]
+#pragma warning restore
+ internal static void Init()
+ {
+ AmetrinSerializer.RegisterSerializer, SequenceModule>();
+ AmetrinSerializer.RegisterSerializer, SequenceModule>();
+ AmetrinSerializer.RegisterSerializer, SequenceModule>();
+ AmetrinSerializer.RegisterSerializer, EmbeddedModule>();
+ }
+
+ public static void Write(IModule module, FileInfo file)
+ {
+ using var stream = file.Create();
+ using var writer = new AmetrinBinaryWriter(stream);
+
+ writer.WriteStringProperty("$format", FILE_EXTENSION);
+ writer.WriteUInt32Property("$version", FORMAT_VERSION);
+
+ AmetrinSerializer.WriteDynamic(writer, module);
+ }
+
+ public static T Read(FileInfo file)
+ {
+ using var stream = file.OpenRead();
+ using var reader = new AmetrinBinaryReader(stream);
+
+ var format = reader.ReadStringProperty("$format");
+ if (format is not FILE_EXTENSION) throw new InvalidOperationException();
+ var version = reader.ReadUInt32Property("$version");
+ if (version is not FORMAT_VERSION) throw new InvalidOperationException();
+
+ return AmetrinSerializer.TryReadDynamic(reader).Or(e => e.Throw());
+ }
+}
diff --git a/ML.Core/Converters/VectorConverter.cs b/ML.Core/Converters/VectorConverter.cs
new file mode 100644
index 0000000..c003c61
--- /dev/null
+++ b/ML.Core/Converters/VectorConverter.cs
@@ -0,0 +1,21 @@
+using Ametrin.Serializer;
+
+namespace ML.Core.Converters;
+
+public sealed class VectorConverter : ISerializationConverter
+{
+ static VectorConverter()
+ {
+ AmetrinSerializer.RegisterSerializer();
+ }
+
+ public static Result TryReadValue(IAmetrinReader reader)
+ {
+ return reader.TryReadArrayValue(static reader => reader.TryReadSingleValue()).Map(Vector.Of);
+ }
+
+ public static void WriteValue(IAmetrinWriter writer, Vector value)
+ {
+ writer.WriteArrayValue(value.AsSpan(), static (writer, v) => writer.WriteSingleValue(v));
+ }
+}
\ No newline at end of file
diff --git a/MachineLearning.Data/ITokenizer.cs b/ML.Core/Data/ITokenizer.cs
similarity index 69%
rename from MachineLearning.Data/ITokenizer.cs
rename to ML.Core/Data/ITokenizer.cs
index ca2cfcb..702f96b 100644
--- a/MachineLearning.Data/ITokenizer.cs
+++ b/ML.Core/Data/ITokenizer.cs
@@ -1,4 +1,4 @@
-namespace MachineLearning.Data;
+namespace ML.Core.Data;
public interface ITokenizer
{
@@ -6,6 +6,5 @@ public interface ITokenizer
public IEnumerable Tokenize(TData data);
public int TokenizeSingle(TData data);
public TData GetToken(int data);
- public string Decode(IEnumerable tokens);
-
-}
+ public TData Decode(IEnumerable tokens);
+}
\ No newline at end of file
diff --git a/ML.Core/Data/Noise/IDataNoise.cs b/ML.Core/Data/Noise/IDataNoise.cs
new file mode 100644
index 0000000..f20311c
--- /dev/null
+++ b/ML.Core/Data/Noise/IDataNoise.cs
@@ -0,0 +1,13 @@
+namespace ML.Core.Data.Noise;
+
+public interface IDataNoise
+{
+ public TData Apply(TData data);
+}
+
+
+public sealed class NoDataNoise : IDataNoise
+{
+ public static NoDataNoise Instance => field ??= new();
+ public TData Apply(TData data) => data;
+}
diff --git a/MachineLearning.Data/Noise/ImageInputNoise.cs b/ML.Core/Data/Noise/ImageNoise.cs
similarity index 81%
rename from MachineLearning.Data/Noise/ImageInputNoise.cs
rename to ML.Core/Data/Noise/ImageNoise.cs
index 790b9b9..1a86066 100644
--- a/MachineLearning.Data/Noise/ImageInputNoise.cs
+++ b/ML.Core/Data/Noise/ImageNoise.cs
@@ -1,12 +1,12 @@
using Ametrin.Utils.Transformation;
-namespace MachineLearning.Data.Noise;
+namespace ML.Core.Data.Noise;
-public sealed class ImageInputNoise : IInputDataNoise
+public sealed class ImageNoise : IDataNoise
{
public required int Size { get; init; }
public double NoiseStrength { get; init; } = 0;
- public double NoiseProbability { get; init; } = 0;
+ // public double NoiseProbability { get; init; } = 0;
public int MaxShift { get; init; } = 0;
public double MaxAngle { get; init; } = 0;
public double MinScale { get; init; } = 1;
@@ -23,10 +23,10 @@ public double[] Apply(double[] data)
};
var transformed = transform.ApplySmooth(data, Size);
- foreach(var i in ..transformed.Length)
+ foreach (var i in ..transformed.Length)
{
transformed[i] += (Random.NextDouble() - 0.5) * 2 * NoiseStrength;
}
return transformed;
}
-}
+}
\ No newline at end of file
diff --git a/ML.Core/Data/Training/BatchHelper.cs b/ML.Core/Data/Training/BatchHelper.cs
new file mode 100644
index 0000000..67b6d5f
--- /dev/null
+++ b/ML.Core/Data/Training/BatchHelper.cs
@@ -0,0 +1,13 @@
+namespace ML.Core.Data.Training;
+
+public static class BatchHelper
+{
+ public static IEnumerable Create(IEnumerable source, int startIndex, int batchSize)
+ => Create(source.Skip(startIndex), batchSize);
+
+ public static IEnumerable Create(IEnumerable source, int batchSize)
+ => source.Take(batchSize);
+
+ public static IEnumerable CreateRandom(ICollection source, int batchSize, Random? random = null)
+ => source.GetRandomElements(batchSize, random);
+}
\ No newline at end of file
diff --git a/ML.Core/Data/Training/ITrainingDataSource.cs b/ML.Core/Data/Training/ITrainingDataSource.cs
new file mode 100644
index 0000000..066fd70
--- /dev/null
+++ b/ML.Core/Data/Training/ITrainingDataSource.cs
@@ -0,0 +1,37 @@
+namespace ML.Core.Data.Training;
+
+public interface ITrainingDataSource
+{
+ public int BatchCount { get; }
+ public int BatchSize { get; }
+ public IEnumerable> GetBatches();
+ public void Reset();
+}
+
+
+public sealed class TrainingDataSource(IEnumerable data) : ITrainingDataSource
+{
+ public bool ShuffleOnReset { get; init; } = true;
+ public Random Random { get; init; } = Random.Shared;
+ public required int BatchCount { get; init; }
+ public int BatchSize => data.Length / BatchCount;
+
+ private readonly T[] data = [.. data];
+
+ public IEnumerable> GetBatches()
+ {
+ var batchSize = BatchSize;
+ foreach (var i in ..BatchCount)
+ {
+ yield return BatchHelper.Create(data, i * batchSize, batchSize);
+ }
+ }
+
+ public void Reset()
+ {
+ if (ShuffleOnReset)
+ {
+ Random.Shuffle(data);
+ }
+ }
+}
\ No newline at end of file
diff --git a/ML.Core/Data/Training/TrainingEntry.cs b/ML.Core/Data/Training/TrainingEntry.cs
new file mode 100644
index 0000000..33ebf47
--- /dev/null
+++ b/ML.Core/Data/Training/TrainingEntry.cs
@@ -0,0 +1,3 @@
+namespace ML.Core.Data.Training;
+
+public sealed record TrainingEntry(TInput InputValue, TArch ExpectedWeights, TExpected ExpectedValue);
diff --git a/ML.Core/Evaluation/Cost/BinaryCrossEntropyCost.cs b/ML.Core/Evaluation/Cost/BinaryCrossEntropyCost.cs
new file mode 100644
index 0000000..c758ae8
--- /dev/null
+++ b/ML.Core/Evaluation/Cost/BinaryCrossEntropyCost.cs
@@ -0,0 +1,26 @@
+namespace ML.Core.Evaluation.Cost;
+
+///
+/// Binary-Cross-Entropy Cost Function
+/// for classification tasks, particularly binary classification
+/// requires outputs in range 0..1
+/// Cons: Numerically unstable (e.g., log(0) issues); this impl clamps outputs into [EPSILON, 1 - EPSILON]
+///
+// TODO: FromLogits version see CrossEntropyCostFromLogits but with Sigmoid
+public sealed class BinaryCrossEntropyCost : ICostFunction
+{
+ public static BinaryCrossEntropyCost Instance => field ??= new();
+ public const Weight EPSILON = 1e-7f;
+
+ public Weight Cost(Weight output, Weight expected)
+ {
+ output = Weight.Clamp(output, EPSILON, 1 - EPSILON); // just return 0 or 1?
+ return -(expected * Weight.Log(output) + (1 - expected) * Weight.Log(1 - output));
+ }
+
+ public Weight Derivative(Weight output, Weight expected)
+ {
+ output = Weight.Clamp(output, EPSILON, 1 - EPSILON); // just return 0 or 1?
+ return (output - expected) / (output * (1 - output));
+ }
+}
\ No newline at end of file
diff --git a/ML.Core/Evaluation/Cost/CrossEntropyCostFromLogits.cs b/ML.Core/Evaluation/Cost/CrossEntropyCostFromLogits.cs
new file mode 100644
index 0000000..8822aad
--- /dev/null
+++ b/ML.Core/Evaluation/Cost/CrossEntropyCostFromLogits.cs
@@ -0,0 +1,39 @@
+using System.Buffers;
+
+namespace ML.Core.Evaluation.Cost;
+
+///
+/// Cross-Entropy Cost Function using SoftMax
+/// requires a linear output
+/// requires expected.Sum() == 1
+/// parts of softmax and cross entropy cancel out in the backwards pass reducing operations, also stabilizes gradients because less divisions
+///
+public sealed class CrossEntropyCostFromLogits : ICostFunction
+{
+ public static readonly CrossEntropyCostFromLogits Instance = new();
+
+ public Weight TotalCost(Vector logits, Vector expected)
+ {
+ NumericsDebug.AssertSameDimensions(logits, expected);
+
+ using var destinationStorage = ArrayPool.Shared.RentNumerics(logits.FlatCount);
+ var destination = Vector.OfSize(logits, destinationStorage);
+
+ var maxLogit = logits.Max();
+ logits.SubtractPointwiseTo(maxLogit, destination);
+ destination.PointwiseExpToSelf();
+ var expSum = destination.Sum();
+
+ var logSumExp = maxLogit + Weight.Log(expSum);
+ var expectedDotLogits = expected.Dot(logits);
+ return -expectedDotLogits + logSumExp;
+ }
+
+ public void DerivativeTo(Vector logits, Vector expected, Vector destination)
+ {
+ NumericsDebug.AssertSameDimensions(logits, expected, destination);
+
+ logits.SoftMaxTo(destination);
+ destination.SubtractToSelf(expected);
+ }
+}
\ No newline at end of file
diff --git a/ML.Core/Evaluation/Cost/CrossEntropyCostFromProbabilities.cs b/ML.Core/Evaluation/Cost/CrossEntropyCostFromProbabilities.cs
new file mode 100644
index 0000000..97f35d9
--- /dev/null
+++ b/ML.Core/Evaluation/Cost/CrossEntropyCostFromProbabilities.cs
@@ -0,0 +1,25 @@
+namespace ML.Core.Evaluation.Cost;
+
+///
+/// Cross-Entropy Cost Function
+/// requires outputs in range 0..1
+/// prefer CrossEntropyCostFromLogits for numerical stability
+/// Cons: Numerically unstable (e.g., log(0) issues); this impl clamps outputs into [EPSILON, 1 - EPSILON]
+///
+public sealed class CrossEntropyCostFromProbabilities : ICostFunction
+{
+ public static CrossEntropyCostFromProbabilities Instance => field ??= new();
+ const Weight EPSILON = 1e-7f;
+
+ public Weight Cost(Weight output, Weight expected)
+ {
+ output = Weight.Clamp(output, EPSILON, 1 - EPSILON);
+ return -expected * Weight.Log(output);
+ }
+
+ public Weight Derivative(Weight output, Weight expected)
+ {
+ return -expected / Weight.Clamp(output, EPSILON, 1 - EPSILON);
+ // return output - expected;
+ }
+}
diff --git a/ML.Core/Evaluation/Cost/ICostFunction.cs b/ML.Core/Evaluation/Cost/ICostFunction.cs
new file mode 100644
index 0000000..d1e0891
--- /dev/null
+++ b/ML.Core/Evaluation/Cost/ICostFunction.cs
@@ -0,0 +1,44 @@
+namespace ML.Core.Evaluation.Cost;
+
+public interface ICostFunction
+ where TArch : ITensorLike
+{
+ public Weight TotalCost(TArch output, TArch expected);
+ public void DerivativeTo(TArch output, TArch expected, TArch destination);
+
+ public TArch Derivative(TArch output, TArch expected)
+ {
+ var result = TArch.OfSize(output);
+ DerivativeTo(output, expected, result);
+ NumericsDebug.AssertValidNumbers(result.AsSpan());
+ return result;
+ }
+}
+
+public interface ICostFunction : ICostFunction
+{
+ internal Weight Cost(Weight output, Weight expected);
+ Weight ICostFunction.TotalCost(Vector output, Vector expected)
+ {
+ NumericsDebug.AssertSameDimensions(output, expected);
+ var totalCost = 0.0f;
+
+ foreach (var i in ..output.Count)
+ {
+ totalCost += Cost(output[i], expected[i]);
+ }
+
+ return totalCost;
+ }
+
+ internal Weight Derivative(Weight output, Weight expected);
+
+ void ICostFunction.DerivativeTo(Vector output, Vector expected, Vector destination)
+ {
+ NumericsDebug.AssertSameDimensions(output, expected, destination);
+ for (int i = 0; i < destination.Count; i++)
+ {
+ destination[i] = Derivative(output[i], expected[i]);
+ }
+ }
+}
\ No newline at end of file
diff --git a/ML.Core/Evaluation/EvaluationResult.cs b/ML.Core/Evaluation/EvaluationResult.cs
new file mode 100644
index 0000000..d96283e
--- /dev/null
+++ b/ML.Core/Evaluation/EvaluationResult.cs
@@ -0,0 +1,44 @@
+namespace ML.Core.Evaluation;
+
+public sealed class EvaluationResult
+{
+ public static readonly EvaluationResult ZERO = new() { TotalCount = 0, CorrectCount = 0, CorrectConfidenceSum = 0, WrongConfidenceSum = 0, TotalCost = 0, TotalElapsedTime = TimeSpan.Zero, stackCount = 0 };
+ public required int TotalCount { get; init; }
+ public int AverageCount => TotalCount / stackCount;
+ public required int CorrectCount { get; init; }
+ public float CorrectPercentage => (float)CorrectCount / TotalCount;
+ public int WrongCount => TotalCount - CorrectCount;
+ public float WrongPercentage => (float)WrongCount / TotalCount;
+
+ public required float CorrectConfidenceSum { get; init; }
+ public float CorrectConfidence => CorrectConfidenceSum / CorrectCount;
+
+ public required float WrongConfidenceSum { get; init; }
+ public float WrongConfidence => WrongConfidenceSum / WrongCount;
+
+ public required double TotalCost { get; init; }
+ public double AverageCost => TotalCost / TotalCount;
+
+ public TimeSpan TotalElapsedTime { get; init; } = TimeSpan.Zero;
+ public TimeSpan AverageElapsedTime => TotalElapsedTime / stackCount;
+ private int stackCount = 1;
+
+ public static EvaluationResult operator +(EvaluationResult left, EvaluationResult right) => new()
+ {
+ TotalCount = left.TotalCount + right.TotalCount,
+ CorrectCount = left.CorrectCount + right.CorrectCount,
+ CorrectConfidenceSum = left.CorrectConfidenceSum + right.CorrectConfidenceSum,
+ WrongConfidenceSum = left.WrongConfidenceSum + right.WrongConfidenceSum,
+ TotalCost = left.TotalCost + right.TotalCost,
+ TotalElapsedTime = left.TotalElapsedTime + right.TotalElapsedTime,
+ stackCount = left.stackCount + right.stackCount
+ };
+
+ public override string ToString() => $"{CorrectPercentage * 100,5:F1}% | {CorrectConfidence:F2} {WrongConfidence:F2}";
+ public string ToColoredString() => $"{ConfidenceToTextColor(CorrectPercentage)}{CorrectPercentage * 100,5:F1}%{RESET_COLOR} | {CorrectConfidence:F2} {WrongConfidence:F2}";
+
+ public static string GetHeader() => " ✅ | Conf. | Cost";
+
+ const string RESET_COLOR = "\u001b[0m";
+ static string ConfidenceToTextColor(Weight confidence) => $"\u001b[38;2;{(1 - confidence) * 255:F0};{confidence * 255:F0};60m";
+}
diff --git a/MachineLearning.Training/Evaluation/TrainingEvaluationContext.cs b/ML.Core/Evaluation/TrainingEvaluationContext.cs
similarity index 56%
rename from MachineLearning.Training/Evaluation/TrainingEvaluationContext.cs
rename to ML.Core/Evaluation/TrainingEvaluationContext.cs
index e59d65c..1f10f4c 100644
--- a/MachineLearning.Training/Evaluation/TrainingEvaluationContext.cs
+++ b/ML.Core/Evaluation/TrainingEvaluationContext.cs
@@ -1,4 +1,4 @@
-namespace MachineLearning.Training.Evaluation;
+namespace ML.Core.Evaluation;
public sealed class TrainingEvaluationContext
{
@@ -6,6 +6,6 @@ public sealed class TrainingEvaluationContext
public required int MaxEpoch { get; init; }
public required int CurrentBatch { get; init; }
public required int MaxBatch { get; init; }
- public required double LearnRate { get; init; }
- public string Dump() => $"epoch {CurrentEpoch}/{MaxEpoch}\tbatch {CurrentBatch}/{MaxBatch}";
+ public required double LearningRate { get; init; }
+ public override string ToString() => $"{CurrentEpoch,2}/{MaxEpoch,-2} {CurrentBatch,4}/{MaxBatch,-4}";
}
diff --git a/ML.Core/Evaluation/TrainingEvaluationResult.cs b/ML.Core/Evaluation/TrainingEvaluationResult.cs
new file mode 100644
index 0000000..f9496f0
--- /dev/null
+++ b/ML.Core/Evaluation/TrainingEvaluationResult.cs
@@ -0,0 +1,12 @@
+namespace ML.Core.Evaluation;
+
+public sealed class TrainingEvaluationResult
+{
+ public required TrainingEvaluationContext Context { get; init; }
+ public required EvaluationResult Result { get; init; }
+ public TimeSpan Duration { get; init; }
+ public override string ToString() => $"{Result.ToColoredString()} | {Result.AverageCost:F4} | {Result.TotalElapsedTime:ss\\.ff}s ({Result.AverageElapsedTime:ss\\.ff}s) | {Context} | {Result.AverageCount}";
+
+ // Emoji helps quickly finding the start of the current training run
+ public static string GetHeader() => $"{EvaluationResult.GetHeader()} | Time (/batch) | epoch batch | entries";
+}
diff --git a/ML.Core/ML.Core.csproj b/ML.Core/ML.Core.csproj
new file mode 100644
index 0000000..b6abd76
--- /dev/null
+++ b/ML.Core/ML.Core.csproj
@@ -0,0 +1,16 @@
+
+
+
+ $(DotNetVersion)
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/ML.Core/Modules/Activations/IActivationModule.cs b/ML.Core/Modules/Activations/IActivationModule.cs
new file mode 100644
index 0000000..309085f
--- /dev/null
+++ b/ML.Core/Modules/Activations/IActivationModule.cs
@@ -0,0 +1,5 @@
+namespace ML.Core.Modules.Activations;
+
+public interface IActivationModule : IModule;
+public interface IActivationModule : IActivationModule, IHiddenModule;
+public interface IActivationModule : IActivationModule, IHiddenModule where TSnapshot : IModuleSnapshot;
diff --git a/ML.Core/Modules/Activations/LeakyReLUActivation.cs b/ML.Core/Modules/Activations/LeakyReLUActivation.cs
new file mode 100644
index 0000000..5f1ecfa
--- /dev/null
+++ b/ML.Core/Modules/Activations/LeakyReLUActivation.cs
@@ -0,0 +1,75 @@
+using ML.Core.Attributes;
+
+namespace ML.Core.Modules.Activations;
+
+[GeneratedModule(IncludeSerializer: true)]
+public sealed partial class LeakyReLUActivation(Weight alpha = 0.01f) : IActivationModule
+{
+ public static LeakyReLUActivation Instance => field ??= new();
+
+ public Weight Alpha { get; } = alpha;
+ private readonly LeakyReLUOperation forwardOp = new(alpha);
+ private readonly LeakyReLUDerivativeOperation derivativeOp = new(alpha);
+
+ public Vector Forward(Vector input, Snapshot snapshot)
+ {
+ snapshot.Input = input;
+ snapshot.Input.MapTo(forwardOp, snapshot.Output);
+ return snapshot.Output;
+ }
+
+ public Vector Backward(Vector outputGradient, Snapshot snapshot, EmptyModuleData gradients)
+ {
+ snapshot.Input.MapTo(derivativeOp, snapshot.InputGradient);
+ snapshot.InputGradient.PointwiseMultiplyToSelf(outputGradient);
+ NumericsDebug.AssertValidNumbers(snapshot.InputGradient);
+ return snapshot.InputGradient;
+ }
+
+ public sealed class Snapshot() : IModuleSnapshot
+ {
+ public Vector Input
+ {
+ get;
+ set
+ {
+ field = value;
+ outputHandle.SetCount(field.Count);
+ inputGradientHandle.SetCount(field.Count);
+ }
+ }
+ public Vector Output => outputHandle.Vector;
+ public Vector InputGradient => inputGradientHandle.Vector;
+
+ private DynamicVector outputHandle = new();
+ private DynamicVector inputGradientHandle = new();
+
+ internal Snapshot(LeakyReLUActivation _) : this() { }
+
+ public void Dispose()
+ {
+ outputHandle.Dispose();
+ inputGradientHandle.Dispose();
+ }
+ }
+
+ public readonly struct LeakyReLUOperation(Weight alpha) : IUnaryOperator
+ {
+ private readonly Weight alpha = alpha;
+ // constructing an alpha vector once and reusing seems to be slower
+
+ public static Weight Invoke(in LeakyReLUOperation info, Weight input) => input > 0 ? input : info.alpha * input;
+ public static SimdVector Invoke(in LeakyReLUOperation info, SimdVector input)
+ => SimdVectorHelper.ConditionalSelect(SimdVectorHelper.GreaterThan(input, SimdVector.Zero), input, input * info.alpha);
+ }
+
+ private readonly struct LeakyReLUDerivativeOperation(Weight alpha) : IUnaryOperator
+ {
+ private readonly Weight alpha = alpha;
+ // constructing an alpha vector once and reusing seems to be slower
+
+ public static Weight Invoke(in LeakyReLUDerivativeOperation info, Weight input) => input > 0 ? 1 : info.alpha;
+ public static SimdVector Invoke(in LeakyReLUDerivativeOperation info, SimdVector input)
+ => SimdVectorHelper.ConditionalSelect(SimdVectorHelper.GreaterThan(input, SimdVector.Zero), SimdVector.One, SimdVectorHelper.Create(info.alpha));
+ }
+}
diff --git a/ML.Core/Modules/Activations/SoftMaxActivation.cs b/ML.Core/Modules/Activations/SoftMaxActivation.cs
new file mode 100644
index 0000000..925952f
--- /dev/null
+++ b/ML.Core/Modules/Activations/SoftMaxActivation.cs
@@ -0,0 +1,51 @@
+using ML.Core.Attributes;
+
+namespace ML.Core.Modules.Activations;
+
+[GeneratedModule(IncludeSerializer: true)]
+public sealed partial class SoftMaxActivation : IActivationModule
+{
+ public static SoftMaxActivation Instance => field ??= new();
+ public Vector Forward(Vector input, Snapshot snapshot)
+ {
+ snapshot.Input = input;
+ snapshot.Input.SoftMaxTo(snapshot.Output);
+ return snapshot.Output;
+ }
+
+ public Vector Backward(Vector outputGradient, Snapshot snapshot, EmptyModuleData gradients)
+ {
+ var dot = snapshot.Output.Dot(outputGradient);
+ outputGradient.SubtractPointwiseTo(dot, snapshot.InputGradient);
+ snapshot.InputGradient.PointwiseMultiplyToSelf(snapshot.Output);
+ NumericsDebug.AssertValidNumbers(snapshot.InputGradient);
+ return snapshot.InputGradient;
+ }
+
+ public sealed class Snapshot() : IModuleSnapshot
+ {
+ public Vector Input
+ {
+ get;
+ set
+ {
+ field = value;
+ outputHandle.SetCount(field.Count);
+ inputGradientHandle.SetCount(field.Count);
+ }
+ }
+ public Vector Output => outputHandle.Vector;
+ public Vector InputGradient => inputGradientHandle.Vector;
+
+ private DynamicVector outputHandle = new();
+ private DynamicVector inputGradientHandle = new();
+
+ internal Snapshot(SoftMaxActivation _) : this() { }
+
+ public void Dispose()
+ {
+ outputHandle.Dispose();
+ inputGradientHandle.Dispose();
+ }
+ }
+}
diff --git a/ML.Core/Modules/Builder/MultiLayerPerceptronBuilder.cs b/ML.Core/Modules/Builder/MultiLayerPerceptronBuilder.cs
new file mode 100644
index 0000000..23eb82a
--- /dev/null
+++ b/ML.Core/Modules/Builder/MultiLayerPerceptronBuilder.cs
@@ -0,0 +1,49 @@
+using ML.Core.Modules.Activations;
+using ML.Core.Modules.Initialization;
+
+namespace ML.Core.Modules.Builder;
+
+public sealed class MultiLayerPerceptronBuilder
+{
+ private readonly List<(int input, int output, IActivationModule activation)> layers = [];
+ private int nextInput;
+ public static MultiLayerPerceptronBuilder Create(int inputNodes) => new() { nextInput = inputNodes };
+
+ public MultiLayerPerceptronBuilder AddLayer(int outputNodes, IActivationModule activation)
+ {
+ layers.Add((nextInput, outputNodes, activation));
+ nextInput = outputNodes;
+ return this;
+ }
+
+ public MultiLayerPerceptronBuilder AddLayer(int outputNodes, Func> activation)
+ {
+ layers.Add((nextInput, outputNodes, activation.Invoke(nextInput, outputNodes)));
+ nextInput = outputNodes;
+ return this;
+ }
+
+ public SequenceModule Build() => new()
+ {
+ Inner = [.. layers.Select(d => new PerceptronModule(d.input, d.output) { Activation = d.activation })],
+ };
+
+ public SequenceModule BuildAndInit(Random random)
+ {
+ var module = Build();
+
+ var initializer = new SequenceModule.Initializer
+ {
+ Inner = [.. module.Inner.Cast().Select(inner => (IModuleInitializer)(inner.Activation switch
+ {
+ SoftMaxActivation or EmptyModule => new PerceptronModule.XavierInitializer() { Random = random },
+ LeakyReLUActivation => new PerceptronModule.KaimingInitializer(inner.Activation) { Random = random },
+ _ => throw new NotImplementedException(),
+ }))],
+ };
+
+ initializer.Init(module);
+
+ return module;
+ }
+}
\ No newline at end of file
diff --git a/ML.Core/Modules/EmbeddedModule.cs b/ML.Core/Modules/EmbeddedModule.cs
new file mode 100644
index 0000000..530f381
--- /dev/null
+++ b/ML.Core/Modules/EmbeddedModule.cs
@@ -0,0 +1,88 @@
+using System.Diagnostics.CodeAnalysis;
+using Ametrin.Serializer;
+using ML.Core.Attributes;
+using ML.Core.Modules.Initialization;
+using ML.Core.Training;
+
+namespace ML.Core.Modules;
+
+[GeneratedModule(IncludeSerializer: true)]
+public sealed partial class EmbeddedModule : IModule, IEmbeddedModule
+{
+ [SubModule] public required IInputModule Input { get; init; }
+ [SubModule] public required IHiddenModule Hidden { get; init; }
+ [SubModule] public required IOutputModule Output { get; init; }
+
+ public EmbeddedModule() { }
+
+ [SetsRequiredMembers]
+ public EmbeddedModule(IInputModule input, IHiddenModule hidden, IOutputModule output)
+ {
+ Input = input;
+ Hidden = hidden;
+ Output = output;
+ }
+
+ public (TOut Output, Weight Confidence, TArch Weights) Forward(TIn input, Snapshot snapshot)
+ {
+ return Output.Forward(Hidden.Forward(Input.Forward(input, snapshot.Input), snapshot.Hidden), snapshot.Output);
+ }
+
+ public TArch Backward(TArch outputGradient, Snapshot snapshot, Gradients gradients)
+ {
+ return Input.Backward(Hidden.Backward(Output.Backward(outputGradient, snapshot.Output, gradients.Output), snapshot.Hidden, gradients.Hidden), snapshot.Input, gradients.Input);
+ }
+
+ (TOut Output, float Confidence) IEmbeddedModule.Forward(TIn input, IModuleSnapshot snapshot)
+ {
+ var (output, confidence, _) = Forward(input, (Snapshot)snapshot);
+ return (output, confidence);
+ }
+
+ static EmbeddedModule()
+ {
+ AdamOptimizer.Registry.Register>(static (o, module) => new Adam(o, module));
+ }
+
+ public sealed class Adam(AdamOptimizer optimizer, EmbeddedModule module) : IModuleOptimizer
+ {
+ public IModuleOptimizer Input { get; } = optimizer.CreateModuleOptimizer(module.Input);
+ public IModuleOptimizer Hidden { get; } = optimizer.CreateModuleOptimizer(module.Hidden);
+ public IModuleOptimizer Output { get; } = optimizer.CreateModuleOptimizer(module.Output);
+
+ public void Apply(Gradients gradients)
+ {
+ Input.Apply(gradients.Input);
+ Hidden.Apply(gradients.Hidden);
+ Output.Apply(gradients.Output);
+ }
+
+ public void FullReset()
+ {
+ Input.FullReset();
+ Hidden.FullReset();
+ Output.FullReset();
+ }
+ }
+
+ public sealed class Initializer : IModuleInitializer>
+ {
+ public IModuleInitializer Input { get; init; } = EmptyModuleInitializer.Instance;
+ public IModuleInitializer Hidden { get; init; } = EmptyModuleInitializer.Instance;
+ public IModuleInitializer Output { get; init; } = EmptyModuleInitializer.Instance;
+
+ public EmbeddedModule Init(EmbeddedModule module)
+ {
+ Input.Init(module.Input);
+ Hidden.Init(module.Hidden);
+ Output.Init(module.Output);
+
+ return module;
+ }
+ }
+}
+
+public interface IEmbeddedModule : IModule
+{
+ public (TOut Output, Weight Confidence) Forward(TIn input, IModuleSnapshot snapshot);
+}
\ No newline at end of file
diff --git a/ML.Core/Modules/EmptyModule.cs b/ML.Core/Modules/EmptyModule.cs
new file mode 100644
index 0000000..3af395e
--- /dev/null
+++ b/ML.Core/Modules/EmptyModule.cs
@@ -0,0 +1,12 @@
+using ML.Core.Attributes;
+using ML.Core.Modules.Activations;
+
+namespace ML.Core.Modules;
+
+[GeneratedModule(IncludeSerializer: true)]
+public sealed partial class EmptyModule : IActivationModule
+{
+ public static EmptyModule Instance => field ??= new();
+ public Vector Forward(Vector input, EmptyModuleData snapshot) => input;
+ public Vector Backward(Vector outputGradient, EmptyModuleData snapshot, EmptyModuleData gradients) => outputGradient;
+}
\ No newline at end of file
diff --git a/ML.Core/Modules/IHiddenModule.cs b/ML.Core/Modules/IHiddenModule.cs
new file mode 100644
index 0000000..2587c20
--- /dev/null
+++ b/ML.Core/Modules/IHiddenModule.cs
@@ -0,0 +1,12 @@
+namespace ML.Core.Modules;
+
+public interface IHiddenModule : IInputModule;
+
+public interface IHiddenModule : IHiddenModule, IModule
+ where TSnapshot : IModuleSnapshot
+ where TGradients : IModuleGradients
+{
+ public TArch Forward(TArch input, TSnapshot snapshot);
+ TArch IInputModule.Forward(TArch input, IModuleSnapshot snapshot)
+ => Forward(input, Guard.Is(snapshot));
+}
\ No newline at end of file
diff --git a/ML.Core/Modules/IInputModule.cs b/ML.Core/Modules/IInputModule.cs
new file mode 100644
index 0000000..619c595
--- /dev/null
+++ b/ML.Core/Modules/IInputModule.cs
@@ -0,0 +1,15 @@
+namespace ML.Core.Modules;
+
+public interface IInputModule : IModule
+{
+ public TArch Forward(TIn input, IModuleSnapshot snapshot);
+}
+
+public interface IInputModule : IInputModule, IModule
+ where TSnapshot : IModuleSnapshot
+ where TGradients : IModuleGradients
+{
+ public TArch Forward(TIn input, TSnapshot snapshot);
+ TArch IInputModule.Forward(TIn input, IModuleSnapshot snapshot)
+ => Forward(input, Guard.Is(snapshot));
+}
diff --git a/ML.Core/Modules/IModule.cs b/ML.Core/Modules/IModule.cs
new file mode 100644
index 0000000..7bd6fbb
--- /dev/null
+++ b/ML.Core/Modules/IModule.cs
@@ -0,0 +1,53 @@
+namespace ML.Core.Modules;
+
+public interface IModule
+{
+ public ulong ParameterCount { get; }
+
+ public IModuleSnapshot CreateSnapshot();
+ public IModuleGradients CreateGradients();
+}
+
+public interface IModule : IModule
+{
+ public TArch Backward(TArch outputGradient, IModuleSnapshot snapshot, IModuleGradients gradients);
+}
+
+public interface IModule : IModule
+ where TSnapshot : IModuleSnapshot
+ where TGradients : IModuleGradients
+{
+ public TArch Backward(TArch outputGradient, TSnapshot snapshot, TGradients gradients);
+ TArch IModule.Backward(TArch outputGradient, IModuleSnapshot snapshot, IModuleGradients gradients)
+ => Backward(outputGradient, Guard.Is(snapshot), Guard.Is(gradients));
+
+ public new TSnapshot CreateSnapshot();
+ public new TGradients CreateGradients();
+
+ IModuleSnapshot IModule.CreateSnapshot() => CreateSnapshot();
+ IModuleGradients IModule.CreateGradients() => CreateGradients();
+}
+
+public interface IModuleSnapshot : IDisposable;
+
+public interface IModuleGradients
+{
+ public void Add(IModuleGradients other);
+ public void Reset();
+}
+
+public interface IModuleGradients : IModuleGradients where TSelf : IModuleGradients
+{
+ public void Add(TSelf other);
+ void IModuleGradients.Add(IModuleGradients other) => Add(Guard.Is(other));
+}
+
+public sealed class EmptyModuleData() : IModuleGradients, IModuleSnapshot
+{
+ public static EmptyModuleData Instance => field ??= new();
+ public EmptyModuleData(object? _) : this() { }
+
+ public void Add(EmptyModuleData other) { }
+ public void Reset() { }
+ public void Dispose() { }
+}
\ No newline at end of file
diff --git a/ML.Core/Modules/IOutputModule.cs b/ML.Core/Modules/IOutputModule.cs
new file mode 100644
index 0000000..dbe9389
--- /dev/null
+++ b/ML.Core/Modules/IOutputModule.cs
@@ -0,0 +1,15 @@
+namespace ML.Core.Modules;
+
+public interface IOutputModule : IModule
+{
+ public (TOut Output, Weight Confidence, TArch Weights) Forward(TArch input, IModuleSnapshot snapshot);
+}
+
+public interface IOutputModule : IOutputModule, IModule
+ where TSnapshot : IModuleSnapshot
+ where TGradients : IModuleGradients
+{
+ public (TOut Output, Weight Confidence, TArch Weights) Forward(TArch input, TSnapshot snapshot);
+ (TOut Output, Weight Confidence, TArch Weights) IOutputModule.Forward(TArch input, IModuleSnapshot snapshot)
+ => Forward(input, Guard.Is(snapshot));
+}
diff --git a/ML.Core/Modules/IndexEmbeddingModule.cs b/ML.Core/Modules/IndexEmbeddingModule.cs
new file mode 100644
index 0000000..33e442b
--- /dev/null
+++ b/ML.Core/Modules/IndexEmbeddingModule.cs
@@ -0,0 +1,118 @@
+using System.Numerics.Tensors;
+using ML.Core.Attributes;
+using ML.Core.Training;
+
+namespace ML.Core.Modules;
+
+[GeneratedModule(IncludeSerializer: true)]
+public sealed partial class IndexEmbeddingModule(Matrix embeddingMatrix) : IInputModule
+{
+ [Weights] public Matrix EmbeddingMatrix { get; } = embeddingMatrix;
+
+ public int TokenCount => EmbeddingMatrix.RowCount;
+ public int EmbeddingSize => EmbeddingMatrix.ColumnCount;
+
+ public IndexEmbeddingModule(int tokenCount, int embeddingSize)
+ : this(Matrix.Create(tokenCount, embeddingSize)) { }
+
+ public Matrix Forward(int[] input, Snapshot snapshot)
+ {
+ snapshot.Input = input;
+
+ foreach (var i in ..input.Length)
+ {
+ GetEmbedding(input[i]).CopyTo(snapshot.Output.RowSpan(i));
+ }
+
+ return snapshot.Output;
+ }
+
+ public Matrix Backward(Matrix outputGradients, Snapshot snapshot, Gradients gradients)
+ {
+ foreach (var i in ..snapshot.Input.Length)
+ {
+ var token = snapshot.Input[i];
+ gradients.TouchedTokens.Add(token);
+ var embeddingGradient = gradients.EmbeddingMatrix.RowSpan(token);
+ TensorPrimitives.Add(embeddingGradient, outputGradients.RowSpan(i), embeddingGradient);
+ }
+
+ return Matrix.Empty;
+ }
+
+ private Span GetEmbedding(int index)
+ {
+ if (index < 0 || index >= EmbeddingMatrix.RowCount)
+ {
+ throw new ArgumentException($"Unknown token: {index}");
+ }
+
+ return EmbeddingMatrix.RowSpan(index);
+ }
+
+ static IndexEmbeddingModule()
+ {
+ AdamOptimizer.Registry.Register(static (op, module) => new Adam(op, module));
+ }
+
+
+ partial class Snapshot
+ {
+ public int[] Input
+ {
+ get;
+ set
+ {
+ field = value;
+ OutputStorage.SetCount(field.Length * module.EmbeddingSize);
+ Output = Matrix.Of(field.Length, module.EmbeddingSize, OutputStorage.Vector);
+ }
+ } = [];
+
+ // TODO: dispose hook to set Output to Empty
+ public Matrix Output { get; private set; }
+
+ private DynamicVector OutputStorage { get; } = new();
+ }
+
+ partial class Gradients
+ {
+ // TODO: clear in reset (remove clear call from Adam.Apply)
+ public HashSet TouchedTokens { get; } = [];
+ }
+
+ public partial class Adam(AdamOptimizer optimizer, IndexEmbeddingModule module) : IModuleOptimizer
+ {
+ public IndexEmbeddingModule Module { get; } = module;
+ public AdamOptimizer Optimizer { get; } = optimizer;
+
+ public Matrix FirstMomentEmbeddingMatrix { get; } = Matrix.OfSize(module.EmbeddingMatrix);
+ public Matrix SecondMomentEmbeddingMatrix { get; } = Matrix.OfSize(module.EmbeddingMatrix);
+
+ public void Apply(Gradients gradients)
+ {
+ foreach (var token in gradients.TouchedTokens)
+ {
+ var gradient = gradients.EmbeddingMatrix.RowSpan(token);
+ var firstMoment = FirstMomentEmbeddingMatrix.RowSpan(token);
+ var secondMoment = SecondMomentEmbeddingMatrix.RowSpan(token);
+ var weights = Module.EmbeddingMatrix.RowSpan(token);
+
+ SpanOperations.MapTo(Optimizer.FirstMomentEstimateOperation, firstMoment, gradient, firstMoment);
+ SpanOperations.MapTo(Optimizer.SecondMomentEstimateOperation, secondMoment, gradient, secondMoment);
+ SpanOperations.MapTo(Optimizer.WeightReductionOperation, weights, firstMoment, secondMoment, weights);
+ }
+
+ NumericsDebug.AssertValidNumbers(FirstMomentEmbeddingMatrix);
+ NumericsDebug.AssertValidNumbers(SecondMomentEmbeddingMatrix);
+
+ gradients.TouchedTokens.Clear(); // TODO: this should happen in Gradients.FullReset();
+ }
+
+ public void FullReset()
+ {
+ FirstMomentEmbeddingMatrix.ResetZero();
+ SecondMomentEmbeddingMatrix.ResetZero();
+ }
+ }
+}
diff --git a/ML.Core/Modules/IndexOutputLayer.cs b/ML.Core/Modules/IndexOutputLayer.cs
new file mode 100644
index 0000000..945eb63
--- /dev/null
+++ b/ML.Core/Modules/IndexOutputLayer.cs
@@ -0,0 +1,33 @@
+using ML.Core.Attributes;
+
+namespace ML.Core.Modules;
+
+[GeneratedModule(IncludeSerializer: true)]
+public sealed partial class IndexOutputLayer(int tokenCount, bool weightedRandom, Random? random = null) : IOutputModule
+{
+ [Property] public int TokenCount { get; } = tokenCount;
+ [Property] public bool WeightedRandom { get; } = weightedRandom;
+ public Random Random { get; } = random ?? Random.Shared;
+
+ public (int Output, float Confidence, Vector Weights) Forward(Vector input, EmptyModuleData snapshot)
+ {
+ Debug.Assert(input.Count == TokenCount);
+
+ var index = WeightedRandom ? GetWeightedRandomIndex(input, Random) : input.MaximumIndex();
+ return (index, input[index], input);
+ }
+
+ public Vector Backward(Vector outputGradient, EmptyModuleData snapshot, EmptyModuleData gradients) => outputGradient;
+
+ private static int GetWeightedRandomIndex(Vector weights, Random random)
+ {
+ var value = random.NextDouble();
+ for (int i = 0; i < weights.Count; i++)
+ {
+ value -= weights[i];
+ if (value < 0)
+ return i;
+ }
+ return weights.Count - 1;
+ }
+}
\ No newline at end of file
diff --git a/ML.Core/Modules/Initialization/IModuleInitializer.cs b/ML.Core/Modules/Initialization/IModuleInitializer.cs
new file mode 100644
index 0000000..8d72f1d
--- /dev/null
+++ b/ML.Core/Modules/Initialization/IModuleInitializer.cs
@@ -0,0 +1,36 @@
+using ML.Core.Modules.Activations;
+
+namespace ML.Core.Modules.Initialization;
+
+public interface IModuleInitializer
+{
+ public IModule Init(IModule module);
+}
+
+public interface IModuleInitializer : IModuleInitializer
+ where TModule : IModule
+{
+ public TModule Init(TModule module);
+ IModule IModuleInitializer.Init(IModule module)
+ => Init(Guard.Is(module));
+}
+
+public sealed class EmptyModuleInitializer : IModuleInitializer
+{
+ public static EmptyModuleInitializer Instance => field ??= new();
+ public IModule Init(IModule module) => module;
+}
+
+public static class InitializationHelper
+{
+ public static Weight GetKaimingGain(IActivationModule n) => n switch
+ {
+ // SigmoidActivation => 1,
+ // TanhActivation => 5 / 3,
+ // ReLUActivation => Weight.Sqrt(2f),
+ LeakyReLUActivation l => Weight.Sqrt(2 / (1 + l.Alpha * l.Alpha)),
+ // Nonlinearity.GELU => Weight.Sqrt(2.0), // common approx
+ // Nonlinearity.Swish => Weight.Sqrt(2.0), // reasonable default
+ _ => throw new NotImplementedException(),
+ };
+}
\ No newline at end of file
diff --git a/ML.Core/Modules/ModuleDataPool.cs b/ML.Core/Modules/ModuleDataPool.cs
new file mode 100644
index 0000000..d1adeff
--- /dev/null
+++ b/ML.Core/Modules/ModuleDataPool.cs
@@ -0,0 +1,68 @@
+using System.Collections.Concurrent;
+
+namespace ML.Core.Modules;
+
+public sealed class ModuleDataPool(Func snapshotGetter, Func gradientGetter)
+{
+ private readonly ConcurrentStack<IModuleGradients> gradientCache = [];
+ private readonly ConcurrentStack<IModuleSnapshot> snapshotCache = [];
+
+ public int UnusedItems => gradientCache.Count;
+
+ public ModuleDataPool(IModule module)
+ : this(module.CreateSnapshot, module.CreateGradients)
+ {
+
+ }
+
+ public RentedSnapshotsMarker RentSnapshot()
+ {
+ var rented = snapshotCache.TryPop(out var snapshots) ? snapshots : snapshotGetter();
+ return new(this, rented);
+ }
+
+ public IModuleGradients RentGradients()
+ {
+ if (gradientCache.TryPop(out var gradients))
+ {
+ return gradients;
+ }
+
+ return gradientGetter();
+ }
+
+
+ public void Return(IModuleGradients gradients)
+ {
+ Debug.Assert(!gradientCache.Contains(gradients));
+ gradients.Reset();
+ gradientCache.Push(gradients);
+ }
+
+ public void Return(IModuleSnapshot snapshots)
+ {
+ Debug.Assert(!snapshotCache.Contains(snapshots));
+ // snapshots are always overwritten, so no reset
+ snapshotCache.Push(snapshots);
+ }
+
+ public void Clear()
+ {
+ foreach(var snapshot in snapshotCache)
+ {
+ snapshot.Dispose();
+ }
+ snapshotCache.Clear();
+ gradientCache.Clear();
+ }
+
+ public readonly ref struct RentedSnapshotsMarker(ModuleDataPool pool, IModuleSnapshot snapshot)
+ {
+ public IModuleSnapshot Snapshot { get; } = snapshot;
+
+ public readonly void Dispose()
+ {
+ pool.Return(Snapshot);
+ }
+ }
+}
\ No newline at end of file
diff --git a/ML.Core/Modules/PerceptronModule.cs b/ML.Core/Modules/PerceptronModule.cs
new file mode 100644
index 0000000..28a0dc8
--- /dev/null
+++ b/ML.Core/Modules/PerceptronModule.cs
@@ -0,0 +1,95 @@
+using System.Diagnostics.CodeAnalysis;
+using ML.Core.Attributes;
+using ML.Core.Modules.Activations;
+using ML.Core.Modules.Initialization;
+
+namespace ML.Core.Modules;
+
+[GeneratedModule(IncludeSerializer: true)]
+public sealed partial class PerceptronModule : IHiddenModule
+{
+ public int InputNodes => Weights.ColumnCount;
+ public int OutputNodes => Weights.RowCount;
+ [SubModule] public required IActivationModule Activation { get; init; }
+ [Weights] public Matrix Weights { get; }
+ [Weights] public Vector Biases { get; }
+
+ public PerceptronModule(int inputNodes, int outputNodes)
+ {
+ Weights = Matrix.Create(outputNodes, inputNodes);
+ Biases = Vector.Create(outputNodes);
+ }
+
+ [SetsRequiredMembers]
+ public PerceptronModule(IActivationModule activation, Matrix weights, Vector biases)
+ {
+ Debug.Assert(weights.RowCount == biases.Count);
+ Weights = weights;
+ Biases = biases;
+ Activation = activation;
+ }
+
+ public Vector Forward(Vector input, Snapshot snapshot)
+ {
+ Debug.Assert(input.Count == InputNodes);
+ snapshot.Input = input;
+ Weights.MultiplyTo(snapshot.Input, snapshot.Weighted);
+ snapshot.Weighted.AddTo(Biases, snapshot.Biased);
+ return Activation.Forward(snapshot.Biased, snapshot.Activation);
+ }
+
+ public Vector Backward(Vector outputGradient, Snapshot snapshot, Gradients gradients)
+ {
+ var biasedGradient = Activation.Backward(outputGradient, snapshot.Activation, gradients.Activation);
+ gradients.Biases.AddToSelf(biasedGradient);
+ VectorHelper.MultiplyToMatrixAddTo(biasedGradient, snapshot.Input, gradients.Weights);
+ Weights.MultiplyTransposedTo(biasedGradient, snapshot.InputGradient);
+ NumericsDebug.AssertValidNumbers(snapshot.InputGradient);
+ return snapshot.InputGradient;
+ }
+
+ partial class Snapshot
+ {
+ public Vector Input { get; set; }
+ public Vector Weighted { get; } = Vector.OfSize(module.Biases);
+ public Vector Biased { get; } = Vector.OfSize(module.Biases);
+ public Vector InputGradient { get; } = Vector.Create(module.InputNodes);
+ }
+
+ [GeneratedAdam(typeof(PerceptronModule))]
+ public sealed partial class Adam;
+
+ ///
+ /// suited for (Leaky)ReLU
+ /// not suited for SoftMax/Sigmoid
+ ///
+ public sealed class KaimingInitializer(IActivationModule activation) : IModuleInitializer
+ {
+ public Random Random { get; init; } = Random.Shared;
+ private readonly Weight gain = InitializationHelper.GetKaimingGain(activation);
+ public PerceptronModule Init(PerceptronModule module)
+ {
+ Debug.Assert(module.Activation is not SoftMaxActivation);
+ module.Weights.KaimingNormal(gain, Random);
+ module.Biases.Normal(0, 0.1f, Random);
+ return module;
+ }
+ }
+
+ ///
+ /// suited for SoftMax/Sigmoid
+ /// not suited for (Leaky)ReLU
+ ///
+ public sealed class XavierInitializer : IModuleInitializer
+ {
+ public static XavierInitializer Instance => field ??= new();
+ public Random Random { get; init; } = Random.Shared;
+ public PerceptronModule Init(PerceptronModule module)
+ {
+ Debug.Assert(module.Activation is not LeakyReLUActivation);
+ module.Weights.XavierUniform(Random);
+ module.Biases.Normal(0, 0.1f, Random);
+ return module;
+ }
+ }
+}
diff --git a/ML.Core/Modules/SequenceModule.cs b/ML.Core/Modules/SequenceModule.cs
new file mode 100644
index 0000000..3d1fe0b
--- /dev/null
+++ b/ML.Core/Modules/SequenceModule.cs
@@ -0,0 +1,120 @@
+using System.Runtime.InteropServices;
+using Ametrin.Serializer;
+using ML.Core.Modules.Initialization;
+
+namespace ML.Core.Modules;
+
+public sealed class SequenceModule : IHiddenModule.Snapshot, SequenceModule.Gradients>
+{
+ public required ImmutableArray> Inner { get; init; }
+
+ public TArch Forward(TArch input, Snapshot snapshot)
+ {
+ Debug.Assert(Inner.Length == snapshot.Inner.Length);
+ return Inner.Zip(snapshot.Inner).Aggregate(input, static (input, m) => m.First.Forward(input, m.Second));
+ }
+
+ public TArch Backward(TArch outputGradient, Snapshot snapshot, Gradients gradients)
+ {
+ Debug.Assert(Inner.Length == snapshot.Inner.Length);
+ Debug.Assert(Inner.Length == gradients.Inner.Length);
+
+ foreach (var i in Inner.IndexRange.Reversed())
+ {
+ outputGradient = Inner[i].Backward(outputGradient, snapshot.Inner[i], gradients.Inner[i]);
+ }
+
+ return outputGradient;
+ }
+
+ public ulong ParameterCount => Inner.Sum(static m => m.ParameterCount);
+ public Snapshot CreateSnapshot() => new(this);
+ public Gradients CreateGradients() => new(this);
+
+ public sealed class Snapshot(SequenceModule module) : IModuleSnapshot
+ {
+ public ImmutableArray Inner { get; } = [.. module.Inner.Select(static m => m.CreateSnapshot())];
+
+ public void Dispose()
+ {
+ Inner.ForEach(static i => i.Dispose());
+ }
+ }
+
+ public sealed class Gradients(SequenceModule module) : IModuleGradients
+ {
+ public ImmutableArray Inner { get; } = [.. module.Inner.Select(static m => m.CreateGradients())];
+
+ public void Add(Gradients other)
+ {
+ Debug.Assert(Inner.Length == other.Inner.Length);
+ foreach (var (left, right) in Inner.Zip(other.Inner))
+ {
+ left.Add(right);
+ }
+ }
+
+ public void Reset()
+ {
+ Inner.ForEach(static m => m.Reset());
+ }
+ }
+
+ static SequenceModule()
+ {
+ Training.AdamOptimizer.Registry.Register>(static (o, module) => new Adam(o, module));
+ }
+
+ public sealed class Adam(Training.AdamOptimizer optimizer, SequenceModule module) : Training.IModuleOptimizer
+ {
+ public ImmutableArray SubOptimizers { get; } = [.. module.Inner.Select(optimizer.CreateModuleOptimizer)];
+ public Training.AdamOptimizer Optimizer { get; } = optimizer;
+
+ public void Apply(Gradients gradients)
+ {
+ Debug.Assert(gradients.Inner.Length == SubOptimizers.Length);
+ SubOptimizers.Zip(gradients.Inner).ForEach(static p => p.First.Apply(p.Second));
+ }
+
+ public void FullReset()
+ {
+ SubOptimizers.ForEach(static sub => sub.FullReset());
+ }
+ }
+
+ public sealed class SharedInitializer : IModuleInitializer>
+ {
+ public IModuleInitializer Inner { get; init; } = EmptyModuleInitializer.Instance;
+
+ public SequenceModule Init(SequenceModule module)
+ {
+ module.Inner.ForEach(m => Inner.Init(m));
+ return module;
+ }
+ }
+
+ public sealed class Initializer : IModuleInitializer>
+ {
+ public required ImmutableArray Inner { get; init; }
+
+ public SequenceModule Init(SequenceModule module)
+ {
+ module.Inner.Zip(Inner).ForEach(static p => p.Second.Init(p.First));
+ return module;
+ }
+ }
+}
+
+public sealed class SequenceModuleConverter : ISerializationConverter>
+{
+ public static Result, DeserializationError> TryReadValue(IAmetrinReader reader)
+ {
+ var modules = reader.TryReadArrayValue(AmetrinSerializer.TryReadDynamic>);
+ return modules.Map(static modules => new SequenceModule { Inner = ImmutableCollectionsMarshal.AsImmutableArray(modules) });
+ }
+
+ public static void WriteValue(IAmetrinWriter writer, SequenceModule value)
+ {
+ writer.WriteArrayValue(value.Inner.AsSpan(), AmetrinSerializer.WriteDynamic);
+ }
+}
\ No newline at end of file
diff --git a/ML.Core/ThreadingMode.cs b/ML.Core/ThreadingMode.cs
new file mode 100644
index 0000000..6cf2985
--- /dev/null
+++ b/ML.Core/ThreadingMode.cs
@@ -0,0 +1,3 @@
+namespace ML.Core;
+
+public enum ThreadingMode { Single, Half, AlmostFull, Full }
\ No newline at end of file
diff --git a/ML.Core/Training/AdamOptimizer.cs b/ML.Core/Training/AdamOptimizer.cs
new file mode 100644
index 0000000..f28c5c2
--- /dev/null
+++ b/ML.Core/Training/AdamOptimizer.cs
@@ -0,0 +1,78 @@
+namespace ML.Core.Training;
+
+public sealed class AdamOptimizer : Optimizer
+{
+ public static ModuleOptimizerRegistry Registry { get; } = [];
+ protected override ModuleOptimizerRegistry RegistryGetter => Registry;
+ public Weight FirstDecayRate { get; init; } = 0.9f;
+ public Weight SecondDecayRate { get; init; } = 0.99f; // or 0.999f
+ public Weight Epsilon { get; init; } = 1e-8f;
+
+ public Weight Iteration
+ {
+ get;
+ set
+ {
+ field = value;
+ CurrentFirstCorrection = 1 - Weight.Pow(FirstDecayRate, Iteration);
+ CurrentSecondCorrection = 1 - Weight.Pow(SecondDecayRate, Iteration);
+ WeightReductionOperation = new(this);
+ }
+ }
+
+ public AdamFirstMomentEstimateOperation FirstMomentEstimateOperation { get; private set; }
+ public AdamSecondMomentEstimateOperation SecondMomentEstimateOperation { get; private set; }
+ public AdamWeightReductionOperation WeightReductionOperation { get; private set; }
+
+ public Weight CurrentFirstCorrection { get; private set; }
+ public Weight CurrentSecondCorrection { get; private set; }
+
+
+ public override void Init()
+ {
+ Iteration = 1; // even when retraining!
+ FirstMomentEstimateOperation = new(FirstDecayRate);
+ SecondMomentEstimateOperation = new(SecondDecayRate);
+ }
+
+ public override void OnBatchCompleted()
+ {
+ Iteration++;
+ }
+
+ public readonly struct AdamFirstMomentEstimateOperation(Weight decayRate) : IBinaryOperator
+ {
+ private readonly Weight decayRate = decayRate;
+
+ public static Weight Invoke(in AdamFirstMomentEstimateOperation state, Weight lastMoment, Weight gradient) => state.decayRate * lastMoment + (1 - state.decayRate) * gradient;
+ public static SimdVector Invoke(in AdamFirstMomentEstimateOperation state, SimdVector lastMoment, SimdVector gradient) => state.decayRate * lastMoment + (1 - state.decayRate) * gradient;
+ }
+
+ public readonly struct AdamSecondMomentEstimateOperation(Weight decayRate) : IBinaryOperator
+ {
+ private readonly Weight decayRate = decayRate;
+ public static Weight Invoke(in AdamSecondMomentEstimateOperation state, Weight lastMoment, Weight gradient) => state.decayRate * lastMoment + (1 - state.decayRate) * gradient * gradient;
+ public static SimdVector Invoke(in AdamSecondMomentEstimateOperation state, SimdVector lastMoment, SimdVector gradient) => state.decayRate * lastMoment + (1 - state.decayRate) * gradient * gradient;
+ }
+
+ public readonly struct AdamWeightReductionOperation(AdamOptimizer context) : ITernaryOperator
+ {
+ private readonly Weight learningRate = context.LearningRate;
+ private readonly Weight firstMomentCorrection = context.CurrentFirstCorrection;
+ private readonly Weight secondMomentCorrection = context.CurrentSecondCorrection;
+ private readonly Weight epsilon = context.Epsilon;
+
+ public static Weight Invoke(in AdamWeightReductionOperation state, Weight currentWeight, Weight firstMoment, Weight secondMoment)
+ {
+ var mHat = firstMoment / state.firstMomentCorrection;
+ var vHat = secondMoment / state.secondMomentCorrection;
+ return currentWeight - (state.learningRate * mHat / (Weight.Sqrt(vHat) + state.epsilon));
+ }
+ public static SimdVector Invoke(in AdamWeightReductionOperation state, SimdVector currentWeight, SimdVector firstMoment, SimdVector secondMoment)
+ {
+ var mHat = firstMoment / state.firstMomentCorrection;
+ var vHat = secondMoment / state.secondMomentCorrection;
+ return currentWeight - (state.learningRate * mHat / (SimdVectorHelper.SquareRoot(vHat) + SimdVectorHelper.Create(state.epsilon)));
+ }
+ }
+}
diff --git a/ML.Core/Training/IModuleOptimizer.cs b/ML.Core/Training/IModuleOptimizer.cs
new file mode 100644
index 0000000..b6af1f3
--- /dev/null
+++ b/ML.Core/Training/IModuleOptimizer.cs
@@ -0,0 +1,25 @@
+using ML.Core.Modules;
+
+namespace ML.Core.Training;
+
+public interface IModuleOptimizer
+{
+ public void Apply(IModuleGradients gradients);
+ public void FullReset();
+};
+
+public interface IModuleOptimizer : IModuleOptimizer
+ where TGradients : IModuleGradients
+{
+ public void Apply(TGradients gradients);
+ void IModuleOptimizer.Apply(IModuleGradients gradients)
+ => Apply(Guard.Is(gradients));
+}
+
+public sealed class EmptyModuleOptimizer : IModuleOptimizer
+{
+ public static EmptyModuleOptimizer Instance { get; } = new();
+
+ public void Apply(EmptyModuleData gradients) { }
+ public void FullReset() { }
+}
\ No newline at end of file
diff --git a/ML.Core/Training/ModuleTrainer.cs b/ML.Core/Training/ModuleTrainer.cs
new file mode 100644
index 0000000..845a15d
--- /dev/null
+++ b/ML.Core/Training/ModuleTrainer.cs
@@ -0,0 +1,193 @@
+using System.Buffers;
+using System.Text;
+using System.Threading;
+using ML.Core.Evaluation;
+using ML.Core.Evaluation.Cost;
+using ML.Core.Modules;
+using ML.Core.Data.Training;
+
+namespace ML.Core.Training;
+
+public sealed class EmbeddedModuleTrainer
+ where TArch : ITensorLike
+{
+ public EmbeddedModule Module { get; }
+ public TrainingConfig Config { get; }
+ public required ITrainingDataSource> TrainingData { get; init; }
+ public required ICostFunction CostFunction { get; init; }
+ public ModuleDataPool DataPool { get; }
+
+ private Optimizer Optimizer => Config.Optimizer;
+ private readonly IModuleOptimizer moduleOptimizer;
+
+ public EmbeddedModuleTrainer(EmbeddedModule module, TrainingConfig config)
+ {
+ Module = module;
+ Config = config;
+ moduleOptimizer = Optimizer.CreateModuleOptimizer(module);
+ DataPool = new(module);
+ }
+
+ public void TrainConsole(bool cancelable = true)
+ {
+ using var cts = new CancellationTokenSource();
+ if (cancelable)
+ {
+ Task.Run(async () =>
+ {
+ while (!cts.IsCancellationRequested)
+ {
+ if (Console.KeyAvailable && Console.ReadKey(intercept: true).Key == ConsoleKey.C)
+ {
+ Console.WriteLine("Canceling...");
+ cts.Cancel();
+ break;
+ }
+ await Task.Delay(500);
+ }
+ });
+ }
+
+ Console.WriteLine($"Training {Module} ({Module.ParameterCount})");
+ Console.WriteLine(GenerateTrainingOverview(Config, TrainingData.BatchCount, TrainingData.BatchSize));
+ Console.WriteLine("Starting Training...");
+ Console.WriteLine(TrainingEvaluationResult.GetHeader());
+ Train(cts.Token);
+ cts.Cancel();
+ Console.WriteLine("Training Done!");
+ }
+
+
+ public void Train(CancellationToken token = default)
+ {
+ Optimizer.Init();
+ var runningEvaluation = EvaluationResult.ZERO;
+
+ foreach (var epochIndex in ..Config.EpochCount)
+ {
+ TrainingData.Reset();
+
+ foreach (var (batchIndex, batch) in TrainingData.GetBatches().Index())
+ {
+ runningEvaluation += RunBatch(batch);
+
+ if ((Config.BatchEvaluationEnabled && batchIndex % Config.EvaluationCallbackAfterBatches is 0)
+ || (batchIndex + 1 == TrainingData.BatchCount && Config.EpochEvaluationEnabled))
+ {
+ Config.EvaluationCallback!.Invoke(new TrainingEvaluationResult { Context = GetContext(), Result = runningEvaluation });
+ runningEvaluation = EvaluationResult.ZERO;
+ }
+
+ Optimizer.OnBatchCompleted();
+
+ if (token.IsCancellationRequested)
+ {
+ Optimizer.OnEpochCompleted();
+ return;
+ }
+
+ TrainingEvaluationContext GetContext() => new()
+ {
+ CurrentBatch = batchIndex + 1,
+ MaxBatch = TrainingData.BatchCount,
+ CurrentEpoch = epochIndex + 1,
+ MaxEpoch = Config.EpochCount,
+ LearningRate = Optimizer.LearningRate,
+ };
+ }
+
+ Optimizer.OnEpochCompleted();
+ }
+ }
+
+ public EvaluationResult RunBatch(IEnumerable> batch)
+ {
+ var timeStamp = Stopwatch.GetTimestamp();
+
+ using var context = ThreadedTrainer.Train(batch, DataPool, Config.Threading, (entry, context) =>
+ {
+ var (output, confidence, cost) = RunEntry(entry, (EmbeddedModule.Gradients)context.Gradients);
+ if (EqualityComparer.Default.Equals(output, entry.ExpectedValue))
+ {
+ context.CorrectCount++;
+ context.CorrectConfidenceSum += confidence;
+ }
+ else
+ {
+ context.WrongConfidenceSum += confidence;
+ }
+
+ context.TotalCount++;
+ context.TotalCost += cost;
+ });
+
+
+ moduleOptimizer.Apply(context.Gradients);
+
+ return new()
+ {
+ TotalCount = context.TotalCount,
+ CorrectCount = context.CorrectCount,
+ CorrectConfidenceSum = context.CorrectConfidenceSum,
+ WrongConfidenceSum = context.WrongConfidenceSum,
+ TotalCost = context.TotalCost,
+ TotalElapsedTime = Stopwatch.GetElapsedTime(timeStamp),
+ };
+ }
+
+ private (TOut output, Weight confidence, Weight cost) RunEntry(TrainingEntry entry, EmbeddedModule.Gradients gradients)
+ {
+ using var marker = DataPool.RentSnapshot();
+ var snapshot = (EmbeddedModule.Snapshot)marker.Snapshot;
+
+ var (output, confidence, outputWeights) = Module.Forward(entry.InputValue, snapshot);
+
+ NumericsDebug.AssertSameDimensions(outputWeights, entry.ExpectedWeights);
+ using var outputGradientStorage = ArrayPool.Shared.RentNumerics(outputWeights.FlatCount);
+ var outputGradient = TArch.OfSize(outputWeights, outputGradientStorage);
+ CostFunction.DerivativeTo(outputWeights, entry.ExpectedWeights, outputGradient);
+
+ var inputGradient = Module.Backward(outputGradient, snapshot, gradients);
+
+ return (output, confidence, CostFunction.TotalCost(outputWeights, entry.ExpectedWeights));
+ }
+
+ public void FullReset()
+ {
+ moduleOptimizer.FullReset();
+ }
+
+ public static string GenerateTrainingOverview(TrainingConfig config, int batchCount, int batchSize)
+ {
+ var sb = new StringBuilder();
+ sb.AppendLine();
+ sb.AppendLine("Training Info:");
+ sb.AppendLine($"using {config.Optimizer.GetType().Name} ({config.Threading})");
+ sb.AppendLine("Training for");
+ sb.AppendLine($" - {config.EpochCount} epochs");
+ sb.AppendLine($" - {batchCount} batches");
+ sb.AppendLine($" - {batchSize} entries");
+
+ if (config.EvaluationCallbackEnabled)
+ {
+ if (config.BatchEvaluationEnabled)
+ {
+ if (config.EvaluationCallbackAfterBatches == 1)
+ {
+ sb.AppendLine("Dumping every batch");
+ }
+ else
+ {
+ sb.AppendLine($"Dumping every {config.EvaluationCallbackAfterBatches} batches");
+ }
+ }
+ else
+ {
+ sb.AppendLine($"Dumping every epoch");
+ }
+ }
+
+ sb.AppendLine();
+ return sb.ToString();
+ }
+}
\ No newline at end of file
diff --git a/ML.Core/Training/Optimizer.cs b/ML.Core/Training/Optimizer.cs
new file mode 100644
index 0000000..82279e7
--- /dev/null
+++ b/ML.Core/Training/Optimizer.cs
@@ -0,0 +1,33 @@
+using ML.Core.Modules;
+
+namespace ML.Core.Training;
+
+public abstract class Optimizer
+{
+ public required Weight LearningRate { get; set; }
+
+ public virtual void Init() { }
+ public virtual void OnBatchCompleted() { }
+ public virtual void OnEpochCompleted() { }
+
+ protected abstract ModuleOptimizerRegistry RegistryGetter { get; }
+ public IModuleOptimizer CreateModuleOptimizer(IModule module)
+ {
+ if (RegistryGetter.TryGetValue(module.GetType(), out var factory))
+ {
+ return factory(this, module);
+ }
+
+ throw new NotImplementedException($"No known {GetType().Name} for {module.GetType().Name}");
+ }
+}
+
+public class ModuleOptimizerRegistry : Dictionary<Type, Func<Optimizer, IModule, IModuleOptimizer>>;
+public sealed class ModuleOptimizerRegistry<TOptimizer> : ModuleOptimizerRegistry where TOptimizer : Optimizer
+{
+ public void Register<TModule>(Func<TOptimizer, TModule, IModuleOptimizer> factory) where TModule : IModule
+ => Add(typeof(TModule), (op, layer) => factory(Guard.Is<TOptimizer>(op), Guard.Is<TModule>(layer)));
+
+ public void RegisterEmpty<TModule>() where TModule : IModule
+ => Add(typeof(TModule), static (_, _) => EmptyModuleOptimizer.Instance);
+}
\ No newline at end of file
diff --git a/MachineLearning.Training/ThreadedTrainer.cs b/ML.Core/Training/ThreadedTrainer.cs
similarity index 59%
rename from MachineLearning.Training/ThreadedTrainer.cs
rename to ML.Core/Training/ThreadedTrainer.cs
index a2a5f0e..744d2c0 100644
--- a/MachineLearning.Training/ThreadedTrainer.cs
+++ b/ML.Core/Training/ThreadedTrainer.cs
@@ -1,15 +1,14 @@
using System.Collections.Concurrent;
-using System.Collections.Immutable;
-using MachineLearning.Data.Entry;
-using MachineLearning.Model.Layer.Snapshot;
+using System.Threading;
+using ML.Core.Modules;
-namespace MachineLearning.Training;
+namespace ML.Core.Training;
public sealed class ThreadedTrainer
{
- public static TrainingContext Train(IEnumerable trainingSet, ModelCachePool contextPool, ThreadingMode threading, Action action)
+ public static TrainingContext Train<TData>(IEnumerable<TData> trainingSet, ModuleDataPool contextPool, ThreadingMode threading, Action<TData, TrainingContext> action)
{
- using var contexts = new ThreadLocal(() => new() { Gradients = contextPool.RentGradients() }, trackAllValues: true);
+ using var contexts = new ThreadLocal<TrainingContext>(() => new() { Pool = contextPool }, trackAllValues: true);
var options = new ParallelOptions
{
MaxDegreeOfParallelism = threading switch
@@ -21,8 +20,7 @@ public static TrainingContext Train(IEnumerable trainingSet, Model
_ => throw new UnreachableException()
},
};
- var partitioner = Partitioner.Create(trainingSet);
- var result = Parallel.ForEach(partitioner, options, (item, state) =>
+ var result = Parallel.ForEach(trainingSet, options, (item, state) =>
{
action(item, contexts.Value!);
});
@@ -34,43 +32,44 @@ public static TrainingContext Train(IEnumerable trainingSet, Model
foreach (var other in contexts.Values.Skip(1))
{
context.Add(other);
- contextPool.Return(other.Gradients);
+ other.Dispose();
}
return context;
}
}
-public enum ThreadingMode { Single, Half, Full, AlmostFull }
-
-public sealed class TrainingContext
+public sealed class TrainingContext : IDisposable
{
public int TotalCount { get; set; }
public int CorrectCount { get; set; }
+ public float CorrectConfidenceSum { get; set; }
+ public float WrongConfidenceSum { get; set; }
public float TotalCost { get; set; }
- public required ImmutableArray Gradients { get; init; }
+ public required ModuleDataPool Pool { get; init; }
+ private IModuleGradients? _gradients;
+ public IModuleGradients Gradients => _gradients ??= Pool.RentGradients();
public void Add(TrainingContext other)
{
TotalCount += other.TotalCount;
CorrectCount += other.CorrectCount;
+ CorrectConfidenceSum += other.CorrectConfidenceSum;
+ WrongConfidenceSum += other.WrongConfidenceSum;
TotalCost += other.TotalCost;
- foreach (var (g, o) in Gradients.Zip(other.Gradients))
- {
- g.Add(o);
- }
+ Gradients.Add(other.Gradients);
}
- public void Reset()
+ public void Dispose()
{
TotalCount = 0;
CorrectCount = 0;
TotalCost = 0;
- foreach (var gradient in Gradients)
+ if (_gradients is not null)
{
- gradient.Reset();
+ Pool.Return(_gradients);
}
}
-}
+}
\ No newline at end of file
diff --git a/ML.Core/Training/TrainingConfig.cs b/ML.Core/Training/TrainingConfig.cs
new file mode 100644
index 0000000..47ed337
--- /dev/null
+++ b/ML.Core/Training/TrainingConfig.cs
@@ -0,0 +1,17 @@
+using ML.Core.Evaluation;
+
+namespace ML.Core.Training;
+
+public sealed record TrainingConfig
+{
+ public required int EpochCount { get; init; }
+
+ public required Optimizer Optimizer { get; init; }
+ public ThreadingMode Threading { get; init; } = ThreadingMode.Full;
+
+ public Action? EvaluationCallback { get; init; } = null;
+ public bool EvaluationCallbackEnabled => EvaluationCallback is not null;
+ public bool EpochEvaluationEnabled => EvaluationCallbackEnabled && !BatchEvaluationEnabled;
+ public int EvaluationCallbackAfterBatches { get; init; } = -1;
+ public bool BatchEvaluationEnabled => EvaluationCallbackEnabled && EvaluationCallbackAfterBatches > 0;
+}
diff --git a/MachineLearning.Samples/AssetManager.cs b/ML.Runner/AssetManager.cs
similarity index 75%
rename from MachineLearning.Samples/AssetManager.cs
rename to ML.Runner/AssetManager.cs
index 1b81bf8..4ac28db 100644
--- a/MachineLearning.Samples/AssetManager.cs
+++ b/ML.Runner/AssetManager.cs
@@ -1,8 +1,11 @@
-namespace MachineLearning.Samples;
+using System.IO;
+using ML.Core.Converters;
+
+namespace ML.Runner;
public static class AssetManager
{
- public static readonly DirectoryInfo Directory = new DirectoryInfo(@"I:\Coding\TestChamber\MachineLearning");
+ public static readonly DirectoryInfo Directory = new(@"I:/Coding/TestChamber/MachineLearning");
public static readonly DirectoryInfo ModelDirectory = Directory.Directory("Model");
public static readonly DirectoryInfo WeightMapsDirectory = Directory.Directory("Maps");
public static readonly DirectoryInfo DataDirectory = Directory.Directory("Data");
@@ -11,7 +14,7 @@ public static class AssetManager
public static readonly FileInfo Sentences = GetDataFile("sentences.txt");
public static readonly FileInfo Speech = GetDataFile("speech.txt");
- public static FileInfo GetModelFile(string fileName) => ModelDirectory.File(fileName.EndsWith(ModelSerializer.FILE_EXTENSION) ? fileName : $"{fileName}{ModelSerializer.FILE_EXTENSION}");
+ public static FileInfo GetModelFile(string fileName) => ModelDirectory.File(fileName.EndsWith(ModuleSerializer.FILE_EXTENSION) ? fileName : $"{fileName}{ModuleSerializer.FILE_EXTENSION}");
public static FileInfo GetDataFile(string fileName) => DataDirectory.File(fileName);
public static DirectoryInfo GetWeightMapFolder(string modelName) => WeightMapsDirectory.Directory(modelName);
-}
+}
\ No newline at end of file
diff --git a/ML.Runner/ML.Runner.csproj b/ML.Runner/ML.Runner.csproj
new file mode 100644
index 0000000..04a4b2d
--- /dev/null
+++ b/ML.Runner/ML.Runner.csproj
@@ -0,0 +1,13 @@
+
+
+
+ Exe
+ $(DotNetVersion)
+
+
+
+
+
+
+
+
diff --git a/ML.Runner/Program.cs b/ML.Runner/Program.cs
new file mode 100644
index 0000000..ac60a80
--- /dev/null
+++ b/ML.Runner/Program.cs
@@ -0,0 +1,15 @@
+using System.Globalization;
+using ML.Runner.Samples.Language;
+using ML.Runner.Samples.Mnist;
+
+CultureInfo.CurrentCulture = CultureInfo.InvariantCulture;
+
+// var random = Random.Shared;
+var random = new Random(69);
+
+MnistModel.Run(random);
+
+#if DEBUG
+// forces all remaining finalizers to be called to make sure all have been disposed
+GC.Collect();
+#endif
\ No newline at end of file
diff --git a/MachineLearning.Samples/Language/C4DataSet.cs b/ML.Runner/Samples/Language/C4DataSet.cs
similarity index 86%
rename from MachineLearning.Samples/Language/C4DataSet.cs
rename to ML.Runner/Samples/Language/C4DataSet.cs
index fbba569..a564078 100644
--- a/MachineLearning.Samples/Language/C4DataSet.cs
+++ b/ML.Runner/Samples/Language/C4DataSet.cs
@@ -1,10 +1,13 @@
+using System.IO;
using System.IO.Compression;
+using System.Net.Http;
using System.Text.Json;
-using MachineLearning.Data;
+using ML.Core.Data;
+using ML.Core.Data.Training;
-namespace MachineLearning.Samples.Language;
+namespace ML.Runner.Samples.Language;
-public sealed class C4DataSet(ITokenizer tokenizer, int contextSize, int initalFile = 0) : ITrainingSet, IDisposable
+public sealed class C4DataSet(ITokenizer tokenizer, int contextSize, int initalFile = 0) : ITrainingDataSource>, IDisposable
{
public int BatchCount { get; } = int.MaxValue;
public required int BatchSize { get; init; }
@@ -18,16 +21,16 @@ public sealed class C4DataSet(ITokenizer tokenizer, int contextSize, int
private Task downloadTask = Download(initalFile);
- public IEnumerable GetBatches()
+ public IEnumerable>> GetBatches()
{
while (true)
{
- yield return new Batch(GetTrainingData().Take(BatchSize));
+ yield return GetTrainingData().Take(BatchSize);
}
}
- private IEnumerator? dataEnumerator;
- public IEnumerable GetTrainingData()
+ private IEnumerator>? dataEnumerator;
+ public IEnumerable> GetTrainingData()
{
while (true)
{
@@ -35,7 +38,7 @@ public IEnumerable GetTrainingData()
{
try
{
- dataEnumerator = tokenizer.Tokenize(NextLine()).ToArray().SlidingWindow(tokenizer.TokenizeSingle("\0"), contextSize).ToTrainingDataMatrix(tokenizer.TokenCount, contextSize, null).GetEnumerator();
+ dataEnumerator = tokenizer.Tokenize(NextLine()).ToArray().SlidingWindow(tokenizer.TokenizeSingle("\0"), contextSize).ToTrainingData(tokenizer.TokenCount).GetEnumerator();
}
catch (Exception) { /* Console.WriteLine(e.Message); */ }
}
@@ -112,6 +115,11 @@ public static async Task Download(int fileIndex)
return file;
}
+ public void Reset()
+ {
+
+ }
+
private sealed class C4FileReader : IDisposable
{
private readonly Stream stream;
diff --git a/MachineLearning.Samples/Language/CharTokenizer.cs b/ML.Runner/Samples/Language/CharTokenizer.cs
similarity index 90%
rename from MachineLearning.Samples/Language/CharTokenizer.cs
rename to ML.Runner/Samples/Language/CharTokenizer.cs
index a7791ef..67a8b2c 100644
--- a/MachineLearning.Samples/Language/CharTokenizer.cs
+++ b/ML.Runner/Samples/Language/CharTokenizer.cs
@@ -1,7 +1,6 @@
-using MachineLearning.Data;
-using System.Diagnostics;
+using ML.Core.Data;
-namespace MachineLearning.Samples.Language;
+namespace ML.Runner.Samples.Language;
public sealed class CharTokenizer(string tokens) : ITokenizer
{
diff --git a/MachineLearning.Samples/Language/LMHelper.cs b/ML.Runner/Samples/Language/LMHelper.cs
similarity index 72%
rename from MachineLearning.Samples/Language/LMHelper.cs
rename to ML.Runner/Samples/Language/LMHelper.cs
index 8501e2c..50b18f8 100644
--- a/MachineLearning.Samples/Language/LMHelper.cs
+++ b/ML.Runner/Samples/Language/LMHelper.cs
@@ -1,11 +1,12 @@
-using MachineLearning.Data;
+using ML.Core.Data;
+using ML.Core.Modules;
-namespace MachineLearning.Samples.Language;
+namespace ML.Runner.Samples.Language;
public static class LMHelper
{
private static readonly HashSet EndTokens = ["\0"];
- public static void StartChat(IEmbeddedModel model, int contextSize, ITokenizer tokenizer)
+ public static void StartChat(IEmbeddedModule model, int contextSize, ITokenizer tokenizer)
{
var fillerToken = tokenizer.TokenizeSingle("\0");
string input;
@@ -18,14 +19,14 @@ public static void StartChat(IEmbeddedModel model, int contextSize,
}
if (!Console.IsOutputRedirected)
{
- Console.SetCursorPosition(0, Console.CursorTop - 1);
+ Console.SetCursorPosition(0, Console.CursorTop - 1);
}
Console.Write(input);
Generate([.. tokenizer.Tokenize(input)], model, contextSize, tokenizer, fillerToken);
} while (true);
}
- public static void Generate(int[] input, IEmbeddedModel model, int contextSize, ITokenizer tokenizer, int fillerToken)
+ public static void Generate(int[] input, IEmbeddedModule model, int contextSize, ITokenizer tokenizer, int fillerToken)
{
if (input.Contains(-1))
{
@@ -36,9 +37,10 @@ public static void Generate(int[] input, IEmbeddedModel model, int c
int prediction;
string token;
Weight confidence;
+ using var snapshot = model.CreateSnapshot();
do
{
- (prediction, confidence) = model.Process(input);
+ (prediction, confidence) = model.Forward(input, snapshot);
token = tokenizer.GetToken(prediction);
input = input[0] == fillerToken ? [.. input[1..], prediction] : [.. input, prediction];
SetConsoleTextColor(confidence);
@@ -48,9 +50,9 @@ public static void Generate(int[] input, IEmbeddedModel model, int c
Console.Write("\u001b[0m"); // reset color
Console.WriteLine();
- static void SetConsoleTextColor(double confidence)
+ static void SetConsoleTextColor(Weight confidence)
{
Console.Write($"\u001b[38;2;{(1 - confidence) * 255:F0};{confidence * 255:F0};60m");
}
}
-}
+}
\ No newline at end of file
diff --git a/MachineLearning.Samples/Language/LanguageDataHelper.cs b/ML.Runner/Samples/Language/LanguageDataHelper.cs
similarity index 63%
rename from MachineLearning.Samples/Language/LanguageDataHelper.cs
rename to ML.Runner/Samples/Language/LanguageDataHelper.cs
index 29ccf5a..20c999a 100644
--- a/MachineLearning.Samples/Language/LanguageDataHelper.cs
+++ b/ML.Runner/Samples/Language/LanguageDataHelper.cs
@@ -1,34 +1,14 @@
-using System.Collections.Frozen;
-using System.Diagnostics;
+using System;
+using System.IO;
using System.Text;
-using MachineLearning.Data;
+using ML.Core.Data;
+using ML.Core.Data.Training;
-namespace MachineLearning.Samples.Language;
+namespace ML.Runner.Samples.Language;
public static class LanguageDataHelper
{
- public static IEnumerable ToTrainingData(this IEnumerable> source, ITokenizer tokenizer)
- {
- var cache = Enumerable.Range(0, tokenizer.TokenCount).Select(i =>
- {
- var vector = Vector.Create(tokenizer.TokenCount);
- vector[i] = 1;
- return new KeyValuePair(i, vector);
- }).ToFrozenDictionary();
-
- return source.Select(MapData);
-
-
- TrainingData MapData(DataEntry e)
- {
- var input = tokenizer.Tokenize(e.Input).ToArray();
- var expectedToken = tokenizer.TokenizeSingle(e.Expected.ToString());
-
- return new TrainingData(input, expectedToken, cache[expectedToken]);
- }
- }
-
- public static IEnumerable ToTrainingData(this IEnumerable> source, int tokenCount)
+ public static IEnumerable> ToTrainingData(this IEnumerable<(int[] Input, int Expected)> source, int tokenCount)
{
var cache = Enumerable.Range(0, tokenCount).Select(i =>
{
@@ -39,14 +19,14 @@ public static IEnumerable ToTrainingData(this IEnumerable e)
+ TrainingEntry MapData((int[] Input, int Expected) e)
{
- return new TrainingData(e.Input, e.Expected, cache[e.Expected]);
+ return new (e.Input, cache[e.Expected], e.Expected);
}
}
- public static IEnumerable ToTrainingDataMatrix(this IEnumerable<(int[] Input, int Expected)> source, int tokenCount, int contextSize, int? fillerToken)
+ public static IEnumerable> ToTrainingDataMatrix(this IEnumerable<(int[] Input, int Expected)> source, int tokenCount, int contextSize, int? fillerToken)
{
var cache = Enumerable.Range(0, tokenCount).Select(i =>
{
@@ -55,13 +35,13 @@ public static IEnumerable ToTrainingDataMatrix(this IEnumerable<(i
return new KeyValuePair(i, vector);
}).ToFrozenDictionary();
- return source.Where(e => e.Input.Length > 0).Select(MapData);
+ return source.Where(static e => e.Input.Length > 0).Select(MapData);
- TrainingData MapData((int[] Input, int Expected) e)
+ TrainingEntry MapData((int[] Input, int Expected) e)
{
return fillerToken.HasValue ? ImplFiller(fillerToken.Value) : Impl();
- TrainingData Impl()
+ TrainingEntry Impl()
{
var length = int.Min(e.Input.Length, contextSize);
var expected = Matrix.Create(length, tokenCount);
@@ -75,11 +55,11 @@ TrainingData Impl()
cache[e.Expected].CopyTo(expected.RowRef(length - 1));
- return new TrainingData(fillerToken.HasValue ? e.Input.PadLeft(contextSize, fillerToken.Value) : e.Input, e.Expected, expected.Storage);
+ return new(fillerToken.HasValue ? e.Input.PadLeft(contextSize, fillerToken.Value) : e.Input, expected, e.Expected);
}
// filling with a filler in this way is probably bad but i'll use dynamic input size anyway
- TrainingData ImplFiller(int filler)
+ TrainingEntry ImplFiller(int filler)
{
var length = contextSize;
var expected = Matrix.Create(length, tokenCount);
@@ -101,7 +81,7 @@ TrainingData ImplFiller(int filler)
cache[e.Expected].CopyTo(expected.RowRef(length - 1));
- return new TrainingData(fillerToken.HasValue ? e.Input.PadLeft(contextSize, fillerToken.Value) : e.Input, e.Expected, expected.Storage);
+ return new(fillerToken.HasValue ? e.Input.PadLeft(contextSize, fillerToken.Value) : e.Input, expected, e.Expected);
}
}
@@ -132,30 +112,24 @@ public static IEnumerable TokenizeSkipInvalid(this IEnumerable so
}
}
- public static IEnumerable> SentencesData(int contextSize)
- => GetLines(AssetManager.Sentences.FullName).InContextSize(contextSize).ExpandPerChar();
-
- public static IEnumerable> SpeechData(int contextSize)
- => GetLines(AssetManager.Speech.FullName).SlidingWindow(contextSize);
-
public static IEnumerable InContextSize(this IEnumerable data, int contextSize)
=> data.Where(s => s.Length <= contextSize);
- public static IEnumerable