-
Notifications
You must be signed in to change notification settings - Fork 862
Introduce HeaderChunker #6979
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Introduce HeaderChunker #6979
Changes from all commits
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
8433c5f
move existing code as-is
adamsitnik a6535a9
solve the warnings and errors
adamsitnik 2c8fdca
some improvements after reading the code again
adamsitnik 30bbdbe
address code review feedback
adamsitnik 98e9b56
address code review feedback
adamsitnik f20c320
address code review feedback:
adamsitnik File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
273 changes: 273 additions & 0 deletions
273
src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/ElementsChunker.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,273 @@ | ||
| // Licensed to the .NET Foundation under one or more agreements. | ||
| // The .NET Foundation licenses this file to you under the MIT license. | ||
|
|
||
| using System; | ||
| using System.Collections.Generic; | ||
| using System.Text; | ||
| using Microsoft.ML.Tokenizers; | ||
| using Microsoft.Shared.Diagnostics; | ||
|
|
||
| namespace Microsoft.Extensions.DataIngestion.Chunkers; | ||
|
|
||
| #pragma warning disable IDE0058 // Expression value is never used | ||
|
|
||
/// <summary>
/// Turns a flat list of document elements (all sharing one header context) into chunks whose
/// tokenized size never exceeds the configured maximum. Stateful only via the reused
/// <see cref="StringBuilder"/>; not thread-safe — TODO confirm callers use a single instance per pipeline.
/// </summary>
internal sealed class ElementsChunker
{
    // Tokenizer used for every size measurement; counts must match what the consumer will see.
    private readonly Tokenizer _tokenizer;
    // Hard upper bound on tokens per emitted chunk, including the context prefix.
    private readonly int _maxTokensPerChunk;
    // Chunk under construction; reused across Process calls and cleared at the end of each.
    private readonly StringBuilder _currentChunk;

    internal ElementsChunker(IngestionChunkerOptions options)
    {
        _ = Throw.IfNull(options);

        _tokenizer = options.Tokenizer;
        _maxTokensPerChunk = options.MaxTokensPerChunk;

        // Token count != character count, but StringBuilder will grow as needed.
        _currentChunk = new(capacity: _maxTokensPerChunk);
    }

    // Goals:
    // 1. Create chunks that do not exceed _maxTokensPerChunk when tokenized.
    // 2. Maintain context in each chunk.
    // 3. If a single IngestionDocumentElement exceeds _maxTokensPerChunk, it should be split intelligently (e.g., paragraphs can be split into sentences, tables into rows).
    /// <summary>
    /// Produces the chunks for <paramref name="elements"/>. Every emitted chunk starts with
    /// <paramref name="context"/> (the concatenated headers). Throws
    /// <see cref="InvalidOperationException"/> when the context, a table header, or a single
    /// unsplittable token run cannot fit within the limit on its own.
    /// </summary>
    internal IEnumerable<IngestionChunk<string>> Process(IngestionDocument document, string context, List<IngestionDocumentElement> elements)
    {
        // Not using yield return here as we use ref structs.
        List<IngestionChunk<string>> chunks = [];

        int contextTokenCount = CountTokens(context.AsSpan());
        // Running token total of _currentChunk; always includes the context prefix.
        int totalTokenCount = contextTokenCount;

        // If the context itself exceeds the max tokens per chunk, we can't do anything.
        if (contextTokenCount >= _maxTokensPerChunk)
        {
            ThrowTokenCountExceeded();
        }

        _currentChunk.Append(context);

        for (int elementIndex = 0; elementIndex < elements.Count; elementIndex++)
        {
            IngestionDocumentElement element = elements[elementIndex];
            string? semanticContent = element switch
            {
                // Image exposes:
                // - Markdown:  which is not very useful for embedding.
                // - AlternativeText: usually a short description of the image, can be null or empty. It is usually less than 50 words.
                // - Text: result of OCR, can be longer, but also can be null or empty. It can be several hundred words.
                // We prefer AlternativeText over Text, as it is usually more relevant.
                IngestionDocumentImage image => image.AlternativeText ?? image.Text,
                _ => element.GetMarkdown()
            };

            if (string.IsNullOrEmpty(semanticContent))
            {
                continue; // An image can come with Markdown, but no AlternativeText or Text.
            }

            int elementTokenCount = CountTokens(semanticContent.AsSpan());
            if (elementTokenCount + totalTokenCount <= _maxTokensPerChunk)
            {
                // Fast path: the whole element fits in the current chunk.
                totalTokenCount += elementTokenCount;
                AppendNewLineAndSpan(_currentChunk, semanticContent.AsSpan());
            }
            else if (element is IngestionDocumentTable table)
            {
                // Table too big for the current chunk: split it by rows, repeating the
                // header row (+ separator) at the top of every chunk the table spans.
                ValueStringBuilder tableBuilder = new(initialCapacity: 8000);

                try
                {
                    AddMarkdownTableRow(table, rowIndex: 0, ref tableBuilder);
                    AddMarkdownTableSeparatorRow(columnCount: table.Cells.GetLength(1), ref tableBuilder);

                    int headerLength = tableBuilder.Length;
                    int headerTokenCount = CountTokens(tableBuilder.AsSpan());

                    // We can't respect the limit if context and header themselves use more tokens.
                    if (contextTokenCount + headerTokenCount >= _maxTokensPerChunk)
                    {
                        ThrowTokenCountExceeded();
                    }

                    if (headerTokenCount + totalTokenCount >= _maxTokensPerChunk)
                    {
                        // We can't add the header row, so commit what we have accumulated so far.
                        Commit();
                    }

                    totalTokenCount += headerTokenCount;
                    // Length of tableBuilder content whose tokens are already accounted for.
                    int tableLength = headerLength;

                    int rowCount = table.Cells.GetLength(0);
                    for (int rowIndex = 1; rowIndex < rowCount; rowIndex++)
                    {
                        AddMarkdownTableRow(table, rowIndex, ref tableBuilder);

                        // Tokens of just the row appended above (everything past tableLength).
                        int lastRowTokens = CountTokens(tableBuilder.AsSpan(tableLength));

                        // Appending this row would exceed the limit.
                        if (totalTokenCount + lastRowTokens > _maxTokensPerChunk)
                        {
                            // We append the table as long as it's not just the header.
                            if (rowIndex != 1)
                            {
                                AppendNewLineAndSpan(_currentChunk, tableBuilder.AsSpan(0, tableLength - Environment.NewLine.Length));
                            }

                            // And commit the table we built so far.
                            Commit();

                            // Erase previous rows and keep only the header.
                            tableBuilder.Length = headerLength;
                            tableLength = headerLength;
                            totalTokenCount += headerTokenCount;

                            if (totalTokenCount + lastRowTokens > _maxTokensPerChunk)
                            {
                                // This row is simply too big even for a fresh chunk:
                                ThrowTokenCountExceeded();
                            }

                            // Re-append the row into the freshly truncated (header-only) builder.
                            AddMarkdownTableRow(table, rowIndex, ref tableBuilder);
                        }

                        tableLength = tableBuilder.Length;
                        totalTokenCount += lastRowTokens;
                    }

                    // Flush whatever table remainder fits; trim the trailing row newline.
                    AppendNewLineAndSpan(_currentChunk, tableBuilder.AsSpan(0, tableLength - Environment.NewLine.Length));
                }
                finally
                {
                    tableBuilder.Dispose();
                }
            }
            else
            {
                // Non-table element too big for the current chunk: slice it by token budget,
                // preferring to break at the last newline that fits.
                ReadOnlySpan<char> remainingContent = semanticContent.AsSpan();

                while (!remainingContent.IsEmpty)
                {
                    int index = _tokenizer.GetIndexByTokenCount(
                        text: remainingContent,
                        maxTokenCount: _maxTokensPerChunk - totalTokenCount,
                        out string? normalizedText,
                        out int tokenCount,
                        considerNormalization: false); // We don't normalize, just append as-is to keep original content.

                    // some tokens fit
                    if (index > 0)
                    {
                        // We could try to split by sentences or other delimiters, but it's complicated.
                        // For simplicity, we will just split at the last new line that fits.
                        // Our promise is not to go over the max token count, not to create perfect chunks.
                        int newLineIndex = remainingContent.Slice(0, index).LastIndexOf('\n');
                        if (newLineIndex > 0)
                        {
                            index = newLineIndex + 1; // We want to include the new line character (works for "\r\n" as well).
                            tokenCount = CountTokens(remainingContent.Slice(0, index));
                        }

                        totalTokenCount += tokenCount;
                        ReadOnlySpan<char> spanToAppend = remainingContent.Slice(0, index);
                        AppendNewLineAndSpan(_currentChunk, spanToAppend);
                        remainingContent = remainingContent.Slice(index);
                    }
                    else if (totalTokenCount == contextTokenCount)
                    {
                        // We are at the beginning of a chunk, and even a single token does not fit.
                        ThrowTokenCountExceeded();
                    }

                    // More content remains: seal the current chunk and keep slicing into a fresh one.
                    if (!remainingContent.IsEmpty)
                    {
                        Commit();
                    }
                }
            }

            // Exactly full: seal the chunk before the next element starts.
            if (totalTokenCount == _maxTokensPerChunk)
            {
                Commit();
            }
        }

        // Emit the trailing partial chunk, but only if it holds more than the bare context.
        if (totalTokenCount > contextTokenCount)
        {
            chunks.Add(new(_currentChunk.ToString(), document, context));
        }

        _currentChunk.Clear();

        return chunks;

        // Seals the current chunk and resets the builder back to just the context prefix.
        void Commit()
        {
            chunks.Add(new(_currentChunk.ToString(), document, context));

            // We keep the context in the current chunk as it's the same for all elements.
            _currentChunk.Remove(
                startIndex: context.Length,
                length: _currentChunk.Length - context.Length);

            totalTokenCount = contextTokenCount;
        }

        static void ThrowTokenCountExceeded()
            => throw new InvalidOperationException("Can't fit in the current chunk. Consider increasing max tokens per chunk.");
    }

    // Appends chars to the builder, preceded by a line break unless the builder is still empty.
    private static void AppendNewLineAndSpan(StringBuilder stringBuilder, ReadOnlySpan<char> chars)
    {
        // Don't start an empty chunk (no context provided) with a new line.
        if (stringBuilder.Length > 0)
        {
            stringBuilder.AppendLine();
        }

#if NET
        stringBuilder.Append(chars);
#else
        // Downlevel targets lack the ReadOnlySpan<char> Append overload; this allocates a string.
        stringBuilder.Append(chars.ToString());
#endif
    }

    // Renders one table row as a Markdown "| cell | cell |" line terminated with a newline.
    // For image cells, prefers AlternativeText over OCR Text (same policy as Process).
    private static void AddMarkdownTableRow(IngestionDocumentTable table, int rowIndex, ref ValueStringBuilder vsb)
    {
        for (int columnIndex = 0; columnIndex < table.Cells.GetLength(1); columnIndex++)
        {
            vsb.Append('|');
            vsb.Append(' ');
            string? cellContent = table.Cells[rowIndex, columnIndex] switch
            {
                null => null,
                IngestionDocumentImage img => img.AlternativeText ?? img.Text,
                IngestionDocumentElement other => other.GetMarkdown()
            };
            vsb.Append(cellContent);
            vsb.Append(' ');
        }

        vsb.Append('|');
        vsb.Append(Environment.NewLine);
    }

    // Renders the Markdown header/body separator line: "| --- | --- |".
    private static void AddMarkdownTableSeparatorRow(int columnCount, ref ValueStringBuilder vsb)
    {
        const int DashCount = 3; // The dash count does not need to match the header length.
        for (int columnIndex = 0; columnIndex < columnCount; columnIndex++)
        {
            vsb.Append('|');
            vsb.Append(' ');
            vsb.Append('-', DashCount);
            vsb.Append(' ');
        }

        vsb.Append('|');
        vsb.Append(Environment.NewLine);
    }

    // Central counting helper; considerNormalization: false keeps counts aligned with the
    // unnormalized text we actually append to chunks.
    private int CountTokens(ReadOnlySpan<char> input)
        => _tokenizer.CountTokens(input, considerNormalization: false);
}
82 changes: 82 additions & 0 deletions
82
src/Libraries/Microsoft.Extensions.DataIngestion/Chunkers/HeaderChunker.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,82 @@ | ||
| // Licensed to the .NET Foundation under one or more agreements. | ||
| // The .NET Foundation licenses this file to you under the MIT license. | ||
|
|
||
| using System; | ||
| using System.Collections.Generic; | ||
| using System.Linq; | ||
| using System.Runtime.CompilerServices; | ||
| using System.Threading; | ||
| using Microsoft.Extensions.DataIngestion.Chunkers; | ||
| using Microsoft.Shared.Diagnostics; | ||
|
|
||
| namespace Microsoft.Extensions.DataIngestion; | ||
|
|
||
/// <summary>
/// Splits documents into chunks based on headers and their corresponding levels, preserving the header context.
/// </summary>
public sealed class HeaderChunker : IngestionChunker<string>
{
    // Deepest header level tracked; headers reporting a deeper level are clamped to this.
    private const int MaxHeaderLevel = 10;
    private readonly ElementsChunker _elementsChunker;

    /// <summary>
    /// Initializes a new instance of the <see cref="HeaderChunker"/> class.
    /// </summary>
    /// <param name="options">The options for the chunker.</param>
    public HeaderChunker(IngestionChunkerOptions options)
    {
        // ElementsChunker validates options (throws on null).
        _elementsChunker = new(options);
    }

    /// <inheritdoc/>
    public override async IAsyncEnumerable<IngestionChunk<string>> ProcessAsync(IngestionDocument document,
        [EnumeratorCancellation] CancellationToken cancellationToken = default)
    {
        _ = Throw.IfNull(document);

        List<IngestionDocumentElement> elements = [];
        // headers[i] holds the most recent header seen at level i; together they form the context.
        string?[] headers = new string?[MaxHeaderLevel + 1];

        foreach (IngestionDocumentElement element in document.EnumerateContent())
        {
            cancellationToken.ThrowIfCancellationRequested();

            if (element is IngestionDocumentHeader header)
            {
                // A new header closes the previous section: flush what was gathered so far.
                foreach (var chunk in SplitIntoChunks(document, headers, elements))
                {
                    yield return chunk;
                }

                // Clamp into [0, MaxHeaderLevel]: a document reporting a deeper (or negative)
                // level must not cause an IndexOutOfRangeException; such headers are treated
                // as the nearest tracked level. Level == null is treated as level 0.
                int headerLevel = Math.Max(0, Math.Min(header.Level.GetValueOrDefault(), MaxHeaderLevel));
                headers[headerLevel] = header.GetMarkdown();
                headers.AsSpan(headerLevel + 1).Clear(); // clear all lower level headers

                continue; // don't add headers to the elements list, they are part of the context
            }

            elements.Add(element);
        }

        // take care of any remaining paragraphs
        foreach (var chunk in SplitIntoChunks(document, headers, elements))
        {
            yield return chunk;
        }
    }

    // Flushes the accumulated elements (if any) through ElementsChunker using the current
    // header stack as context, then clears the element list for the next section.
    private IEnumerable<IngestionChunk<string>> SplitIntoChunks(IngestionDocument document, string?[] headers, List<IngestionDocumentElement> elements)
    {
        if (elements.Count > 0)
        {
            string chunkHeader = string.Join(" ", headers.Where(h => !string.IsNullOrEmpty(h)));

            foreach (var chunk in _elementsChunker.Process(document, chunkHeader, elements))
            {
                yield return chunk;
            }

            elements.Clear();
        }
    }
}
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.