Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Directory.Build.targets
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@

<!-- Legacy targets do not support attributes for a nullable context thus suppressing null check warnings -->
<NoWarn Condition="'$(TargetFramework)' == 'net462' or '$(TargetFramework)' == 'netstandard2.0'">$(NoWarn);CA1062</NoWarn>

<!-- Async method lacks 'await' operators and will run synchronously (The C# compiler has already removed it) -->
<NoWarn>$(NoWarn);CS1998</NoWarn>
</PropertyGroup>

<ItemGroup Condition="('$(Stage)' == 'normal' OR '$(Stage)' == 'obsolete') AND '$(OutputType)' != 'Exe' AND '$(IsPackable)' == 'true' AND '$(Api)' != 'false'">
Expand Down
1 change: 1 addition & 0 deletions eng/packages/TestOnly.props
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
<PackageVersion Include="Microsoft.Diagnostics.Tracing.TraceEvent" Version="3.1.3" />
<PackageVersion Include="Microsoft.Extensions.Configuration.EnvironmentVariables" Version="9.0.0" />
<PackageVersion Include="Microsoft.Extensions.Configuration.UserSecrets" Version="9.0.0" />
<PackageVersion Include="Microsoft.ML.Tokenizers.Data.Cl100kBase" Version="$(MicrosoftMLTokenizersVersion)" />
<PackageVersion Include="Microsoft.ML.Tokenizers.Data.O200kBase" Version="$(MicrosoftMLTokenizersVersion)" />
<PackageVersion Include="Microsoft.SemanticKernel.Connectors.InMemory" Version="$(MicrosoftSemanticKernelConnectorsVersion)" />
<PackageVersion Include="Microsoft.SemanticKernel.Connectors.SqliteVec" Version="$(MicrosoftSemanticKernelConnectorsVersion)" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,21 @@ public IngestionDocumentHeader(string markdown)
/// <summary>
/// Gets or sets the level of the header.
/// </summary>
public int? Level
{
    get => field;
    set
    {
        if (value.HasValue)
        {
            // Only levels 1 through 10 are accepted; out-of-range values throw ArgumentOutOfRangeException.
            field = Throw.IfOutOfRange(value.Value, min: 1, max: 10, nameof(value));
        }
        else
        {
            // null means the header has no known level.
            field = null;
        }
    }
}
}

/// <summary>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,273 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Text;
using Microsoft.ML.Tokenizers;
using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.DataIngestion.Chunkers;

#pragma warning disable IDE0058 // Expression value is never used

/// <summary>
/// Splits a flat list of document elements into chunks whose token count (as measured by the
/// configured <see cref="Tokenizer"/>) never exceeds the configured maximum, prefixing every
/// chunk with the caller-supplied context string.
/// </summary>
internal sealed class ElementsChunker
{
    private readonly Tokenizer _tokenizer;
    private readonly int _maxTokensPerChunk;

    // Reused across Process calls; always cleared before Process returns.
    private readonly StringBuilder _currentChunk;

    internal ElementsChunker(IngestionChunkerOptions options)
    {
        _ = Throw.IfNull(options);

        _tokenizer = options.Tokenizer;
        _maxTokensPerChunk = options.MaxTokensPerChunk;

        // Token count != character count, but StringBuilder will grow as needed.
        _currentChunk = new(capacity: _maxTokensPerChunk);
    }

    // Goals:
    // 1. Create chunks that do not exceed _maxTokensPerChunk when tokenized.
    // 2. Maintain context in each chunk.
    // 3. If a single IngestionDocumentElement exceeds _maxTokensPerChunk, it should be split intelligently (e.g., paragraphs can be split into sentences, tables into rows).
    /// <summary>
    /// Produces the chunks for <paramref name="elements"/>. Each returned chunk starts with
    /// <paramref name="context"/> and stays within the configured token budget.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// Thrown when the context, a table header, or a single indivisible piece of content cannot
    /// fit within the maximum tokens per chunk on its own.
    /// </exception>
    internal IEnumerable<IngestionChunk<string>> Process(IngestionDocument document, string context, List<IngestionDocumentElement> elements)
    {
        // Not using yield return here as we use ref structs.
        List<IngestionChunk<string>> chunks = [];

        int contextTokenCount = CountTokens(context.AsSpan());
        int totalTokenCount = contextTokenCount;

        // If the context itself exceeds the max tokens per chunk, we can't do anything.
        if (contextTokenCount >= _maxTokensPerChunk)
        {
            ThrowTokenCountExceeded();
        }

        _currentChunk.Append(context);

        for (int elementIndex = 0; elementIndex < elements.Count; elementIndex++)
        {
            IngestionDocumentElement element = elements[elementIndex];
            string? semanticContent = element switch
            {
                // Image exposes:
                // - Markdown: ![Alt Text](url) which is not very useful for embedding.
                // - AlternativeText: usually a short description of the image, can be null or empty. It is usually less than 50 words.
                // - Text: result of OCR, can be longer, but also can be null or empty. It can be several hundred words.
                // We prefer AlternativeText over Text, as it is usually more relevant.
                IngestionDocumentImage image => image.AlternativeText ?? image.Text,
                _ => element.GetMarkdown()
            };

            if (string.IsNullOrEmpty(semanticContent))
            {
                continue; // An image can come with Markdown, but no AlternativeText or Text.
            }

            int elementTokenCount = CountTokens(semanticContent.AsSpan());
            if (elementTokenCount + totalTokenCount <= _maxTokensPerChunk)
            {
                // Whole element fits in the current chunk: append it as-is.
                totalTokenCount += elementTokenCount;
                AppendNewLineAndSpan(_currentChunk, semanticContent.AsSpan());
            }
            else if (element is IngestionDocumentTable table)
            {
                // Tables are split by rows; the header row (and separator) is repeated in every chunk.
                ValueStringBuilder tableBuilder = new(initialCapacity: 8000);

                try
                {
                    AddMarkdownTableRow(table, rowIndex: 0, ref tableBuilder);
                    AddMarkdownTableSeparatorRow(columnCount: table.Cells.GetLength(1), ref tableBuilder);

                    int headerLength = tableBuilder.Length;
                    int headerTokenCount = CountTokens(tableBuilder.AsSpan());

                    // We can't respect the limit if context and header themselves use more tokens.
                    if (contextTokenCount + headerTokenCount >= _maxTokensPerChunk)
                    {
                        ThrowTokenCountExceeded();
                    }

                    if (headerTokenCount + totalTokenCount >= _maxTokensPerChunk)
                    {
                        // We can't add the header row, so commit what we have accumulated so far.
                        Commit();
                    }

                    totalTokenCount += headerTokenCount;
                    // tableLength tracks the committed portion of tableBuilder (everything except a
                    // row that is still being evaluated for fit).
                    int tableLength = headerLength;

                    int rowCount = table.Cells.GetLength(0);
                    for (int rowIndex = 1; rowIndex < rowCount; rowIndex++)
                    {
                        AddMarkdownTableRow(table, rowIndex, ref tableBuilder);

                        int lastRowTokens = CountTokens(tableBuilder.AsSpan(tableLength));

                        // Appending this row would exceed the limit.
                        if (totalTokenCount + lastRowTokens > _maxTokensPerChunk)
                        {
                            // We append the table as long as it's not just the header.
                            if (rowIndex != 1)
                            {
                                AppendNewLineAndSpan(_currentChunk, tableBuilder.AsSpan(0, tableLength - Environment.NewLine.Length));
                            }

                            // And commit the table we built so far.
                            Commit();

                            // Erase previous rows and keep only the header.
                            tableBuilder.Length = headerLength;
                            tableLength = headerLength;
                            totalTokenCount += headerTokenCount;

                            if (totalTokenCount + lastRowTokens > _maxTokensPerChunk)
                            {
                                // This row is simply too big even for a fresh chunk:
                                ThrowTokenCountExceeded();
                            }

                            AddMarkdownTableRow(table, rowIndex, ref tableBuilder);
                        }

                        tableLength = tableBuilder.Length;
                        totalTokenCount += lastRowTokens;
                    }

                    // Flush whatever remains of the table, trimming the trailing new line.
                    AppendNewLineAndSpan(_currentChunk, tableBuilder.AsSpan(0, tableLength - Environment.NewLine.Length));
                }
                finally
                {
                    tableBuilder.Dispose();
                }
            }
            else
            {
                // Non-table element that doesn't fit: split at token boundaries, preferring new lines.
                ReadOnlySpan<char> remainingContent = semanticContent.AsSpan();

                while (!remainingContent.IsEmpty)
                {
                    int index = _tokenizer.GetIndexByTokenCount(
                        text: remainingContent,
                        maxTokenCount: _maxTokensPerChunk - totalTokenCount,
                        out string? normalizedText,
                        out int tokenCount,
                        considerNormalization: false); // We don't normalize, just append as-is to keep original content.

                    // some tokens fit
                    if (index > 0)
                    {
                        // We could try to split by sentences or other delimiters, but it's complicated.
                        // For simplicity, we will just split at the last new line that fits.
                        // Our promise is not to go over the max token count, not to create perfect chunks.
                        int newLineIndex = remainingContent.Slice(0, index).LastIndexOf('\n');
                        if (newLineIndex > 0)
                        {
                            index = newLineIndex + 1; // We want to include the new line character (works for "\r\n" as well).
                            tokenCount = CountTokens(remainingContent.Slice(0, index));
                        }

                        totalTokenCount += tokenCount;
                        ReadOnlySpan<char> spanToAppend = remainingContent.Slice(0, index);
                        AppendNewLineAndSpan(_currentChunk, spanToAppend);
                        remainingContent = remainingContent.Slice(index);
                    }
                    else if (totalTokenCount == contextTokenCount)
                    {
                        // We are at the beginning of a chunk, and even a single token does not fit.
                        ThrowTokenCountExceeded();
                    }

                    if (!remainingContent.IsEmpty)
                    {
                        Commit();
                    }
                }
            }

            if (totalTokenCount == _maxTokensPerChunk)
            {
                // The chunk is exactly full; start a fresh one for the next element.
                Commit();
            }
        }

        // Emit the final chunk only if something beyond the bare context was accumulated.
        if (totalTokenCount > contextTokenCount)
        {
            chunks.Add(new(_currentChunk.ToString(), document, context));
        }

        _currentChunk.Clear();

        return chunks;

        // Adds the accumulated text as a chunk and resets the builder back to just the context.
        void Commit()
        {
            chunks.Add(new(_currentChunk.ToString(), document, context));

            // We keep the context in the current chunk as it's the same for all elements.
            _currentChunk.Remove(
                startIndex: context.Length,
                length: _currentChunk.Length - context.Length);
            totalTokenCount = contextTokenCount;
        }

        static void ThrowTokenCountExceeded()
            => throw new InvalidOperationException("Can't fit in the current chunk. Consider increasing max tokens per chunk.");
    }

    /// <summary>Appends <paramref name="chars"/>, preceded by a new line when the builder is non-empty.</summary>
    private static void AppendNewLineAndSpan(StringBuilder stringBuilder, ReadOnlySpan<char> chars)
    {
        // Don't start an empty chunk (no context provided) with a new line.
        if (stringBuilder.Length > 0)
        {
            stringBuilder.AppendLine();
        }

#if NET
        stringBuilder.Append(chars);
#else
        // Append(ReadOnlySpan<char>) is not available on older TFMs; materialize the string.
        stringBuilder.Append(chars.ToString());
#endif
    }

    /// <summary>Renders one table row as a Markdown table row, terminated with a new line.</summary>
    private static void AddMarkdownTableRow(IngestionDocumentTable table, int rowIndex, ref ValueStringBuilder vsb)
    {
        for (int columnIndex = 0; columnIndex < table.Cells.GetLength(1); columnIndex++)
        {
            vsb.Append('|');
            vsb.Append(' ');
            string? cellContent = table.Cells[rowIndex, columnIndex] switch
            {
                null => null,
                IngestionDocumentImage img => img.AlternativeText ?? img.Text,
                IngestionDocumentElement other => other.GetMarkdown()
            };
            vsb.Append(cellContent);
            vsb.Append(' ');
        }

        vsb.Append('|');
        vsb.Append(Environment.NewLine);
    }

    /// <summary>Renders the Markdown header/body separator row (| --- | --- |), terminated with a new line.</summary>
    private static void AddMarkdownTableSeparatorRow(int columnCount, ref ValueStringBuilder vsb)
    {
        const int DashCount = 3; // The dash count does not need to match the header length.
        for (int columnIndex = 0; columnIndex < columnCount; columnIndex++)
        {
            vsb.Append('|');
            vsb.Append(' ');
            vsb.Append('-', DashCount);
            vsb.Append(' ');
        }

        vsb.Append('|');
        vsb.Append(Environment.NewLine);
    }

    private int CountTokens(ReadOnlySpan<char> input)
        => _tokenizer.CountTokens(input, considerNormalization: false);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Threading;
using Microsoft.Extensions.DataIngestion.Chunkers;
using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.DataIngestion;

/// <summary>
/// Splits documents into chunks based on headers and their corresponding levels, preserving the header context.
/// </summary>
/// <summary>
/// Splits documents into chunks based on headers and their corresponding levels, preserving the header context.
/// </summary>
public sealed class HeaderChunker : IngestionChunker<string>
{
    private const int MaxHeaderLevel = 10;
    private readonly ElementsChunker _elementsChunker;

    /// <summary>
    /// Initializes a new instance of the <see cref="HeaderChunker"/> class.
    /// </summary>
    /// <param name="options">The options for the chunker.</param>
    public HeaderChunker(IngestionChunkerOptions options) => _elementsChunker = new(options);

    /// <inheritdoc/>
    public override async IAsyncEnumerable<IngestionChunk<string>> ProcessAsync(IngestionDocument document,
        [EnumeratorCancellation] CancellationToken cancellationToken = default)
    {
        _ = Throw.IfNull(document);

        // One slot per header level; slot 0 is used for headers without an explicit level.
        string?[] activeHeaders = new string?[MaxHeaderLevel + 1];
        List<IngestionDocumentElement> pending = [];

        foreach (IngestionDocumentElement element in document.EnumerateContent())
        {
            cancellationToken.ThrowIfCancellationRequested();

            if (element is not IngestionDocumentHeader header)
            {
                pending.Add(element);
                continue;
            }

            // A header starts a new section: flush everything gathered under the previous one.
            foreach (IngestionChunk<string> chunk in SplitIntoChunks(document, activeHeaders, pending))
            {
                yield return chunk;
            }

            // Record the header as context; deeper levels no longer apply.
            // Headers themselves are not added to the pending elements.
            int level = header.Level ?? 0;
            activeHeaders[level] = header.GetMarkdown();
            activeHeaders.AsSpan(level + 1).Clear();
        }

        // Flush the trailing section, if any.
        foreach (IngestionChunk<string> chunk in SplitIntoChunks(document, activeHeaders, pending))
        {
            yield return chunk;
        }
    }

    // Emits the chunks for the accumulated elements (joining the active headers into a single
    // context string) and clears the element list once enumeration completes.
    private IEnumerable<IngestionChunk<string>> SplitIntoChunks(IngestionDocument document, string?[] headers, List<IngestionDocumentElement> elements)
    {
        if (elements.Count == 0)
        {
            yield break;
        }

        string context = string.Join(" ", headers.Where(static h => !string.IsNullOrEmpty(h)));

        foreach (IngestionChunk<string> chunk in _elementsChunker.Process(document, context, elements))
        {
            yield return chunk;
        }

        elements.Clear();
    }
}
Loading
Loading