Skip to content

Commit 55e5af9

Browse files
authored
Merge pull request #2503 from keboola/vb/DMD-942/llm-export-polish-performance
DMD-942 - Add progress reporting and polish for llm export
2 parents de562df + 03e5a09 commit 55e5af9

7 files changed

Lines changed: 425 additions & 62 deletions

File tree

CLAUDE.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,8 +227,13 @@ func buildUID(prefix, name string) string {
227227
- **Observability**: Structured logging; OpenTelemetry integration; metrics for critical paths
228228
- **Early returns**: Prefer early `return` / `continue` to reduce nesting
229229
- **Default-first assignment**: Set default value first, then override if present. Avoid resetting nil values after assignment.
230+
- **Deterministic output**: When iterating over maps, always sort results before output to ensure consistent ordering across runs
231+
- **Avoid else-if chains**: Prefer early returns, switch statements, or restructured logic
232+
- **Type assertions with nil checks**: After type assertions, always check for nil before using the value
233+
- **nolint directives**: Use `//nolint:directive` format (no space after //)
234+
- **Parser/processor pattern**: Use struct with logger as constructor dependency for components that need logging; keep utility functions that don't need logger as standalone
230235

231-
Example - instead of:
236+
Example of default-first assignment - instead of:
232237
```go
233238
value := someMap[key]
234239
if value == nil {
@@ -258,6 +263,11 @@ if items == nil {
258263
}
259264
```
260265

266+
### Commit Practices
267+
- **Make small, focused commits** - One logical change per commit
268+
- **Don't bundle multiple tasks** into a single commit; split them for easier review and rebasing
269+
- **Example**: If implementing column metadata, transformation code output, and component configs, create 3 separate commits rather than one large commit
270+
261271
### Testing
262272
- Test files use `*_test.go` suffix and are located next to implementation
263273
- Use `testify/assert` for assertions

internal/pkg/llm/twinformat/configparser/parser.go

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -344,21 +344,25 @@ func ParseComponentConfig(comp *keboola.ComponentWithConfigs, cfg *keboola.Confi
344344
Created: cfg.Created.String(),
345345
}
346346

347+
if cfg.Content == nil {
348+
return config
349+
}
350+
347351
// Parse storage.output.tables for output mappings (applications/extractors write to tables too)
348-
if cfg.Content != nil {
349-
if storage, ok := cfg.Content.Get("storage"); ok {
350-
if storageMap := toStringMap(storage); storageMap != nil {
351-
config.OutputTables = ParseStorageMappings(storageMap, "output")
352-
}
352+
if storage, ok := cfg.Content.Get("storage"); ok {
353+
if storageMap := toStringMap(storage); storageMap != nil {
354+
config.OutputTables = ParseStorageMappings(storageMap, "output")
353355
}
356+
}
354357

355-
// Convert configuration content to map
356-
config.Configuration = make(map[string]any)
357-
for _, key := range cfg.Content.Keys() {
358-
if val, ok := cfg.Content.Get(key); ok {
359-
config.Configuration[key] = val
360-
}
358+
// Convert configuration content to map
359+
config.Configuration = make(map[string]any)
360+
for _, key := range cfg.Content.Keys() {
361+
val, ok := cfg.Content.Get(key)
362+
if !ok {
363+
continue
361364
}
365+
config.Configuration[key] = val
362366
}
363367

364368
return config

internal/pkg/llm/twinformat/generator.go

Lines changed: 169 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ import (
77
"sort"
88
"time"
99

10+
"github.com/keboola/keboola-sdk-go/v2/pkg/keboola"
11+
1012
"github.com/keboola/keboola-as-code/internal/pkg/filesystem"
1113
"github.com/keboola/keboola-as-code/internal/pkg/llm/twinformat/templates"
1214
"github.com/keboola/keboola-as-code/internal/pkg/llm/twinformat/writer"
@@ -95,6 +97,10 @@ func (g *Generator) Generate(ctx context.Context, data *ProcessedData) error {
9597
return errors.Errorf("failed to generate AI guide: %w", err)
9698
}
9799

100+
if err := g.generateDataDictionary(ctx, data); err != nil {
101+
return errors.Errorf("failed to generate data dictionary: %w", err)
102+
}
103+
98104
g.logger.Infof(ctx, "Twin format output generated successfully")
99105
return nil
100106
}
@@ -276,35 +282,10 @@ func (g *Generator) generateTableMetadata(ctx context.Context, table *ProcessedT
276282
// buildColumnDetails builds detailed column information including metadata.
277283
func (g *Generator) buildColumnDetails(table *ProcessedTable) []map[string]any {
278284
columns := make([]map[string]any, 0, len(table.Columns))
279-
280285
for _, colName := range table.Columns {
281-
col := map[string]any{
282-
"name": colName,
283-
}
284-
285-
// Extract column metadata if available
286-
if table.Table != nil && table.ColumnMetadata != nil {
287-
if colMeta, ok := table.ColumnMetadata[colName]; ok {
288-
for _, meta := range colMeta {
289-
switch meta.Key {
290-
case "KBC.datatype.basetype":
291-
col["base_type"] = meta.Value
292-
case "KBC.datatype.type":
293-
col["type"] = meta.Value
294-
case "KBC.datatype.nullable":
295-
col["nullable"] = meta.Value == "1" || meta.Value == "true"
296-
case "KBC.datatype.length":
297-
col["length"] = meta.Value
298-
case "KBC.description":
299-
col["description"] = meta.Value
300-
}
301-
}
302-
}
303-
}
304-
286+
col := buildColumnDetailEntry(colName, table.ColumnMetadata)
305287
columns = append(columns, col)
306288
}
307-
308289
return columns
309290
}
310291

@@ -835,6 +816,11 @@ func (g *Generator) generateSourcesIndex(ctx context.Context, data *ProcessedDat
835816
docFields := SourcesIndexDocFields()
836817
sources := buildSourcesList(data.Buckets)
837818

819+
// Sort sources by ID for deterministic output.
820+
sort.Slice(sources, func(i, j int) bool {
821+
return sources[i]["id"].(string) < sources[j]["id"].(string)
822+
})
823+
838824
sourcesIndex := map[string]any{
839825
"_comment": docFields.Comment,
840826
"_purpose": docFields.Purpose,
@@ -878,6 +864,8 @@ func buildSourcesList(buckets []*ProcessedBucket) []map[string]any {
878864

879865
sources := make([]map[string]any, 0, len(sourceMap))
880866
for _, info := range sourceMap {
867+
// Sort buckets for deterministic output.
868+
sort.Strings(info.Buckets)
881869
sources = append(sources, map[string]any{
882870
"id": info.ID,
883871
"name": info.Name,
@@ -890,6 +878,60 @@ func buildSourcesList(buckets []*ProcessedBucket) []map[string]any {
890878
return sources
891879
}
892880

881+
// buildColumnDetailEntry builds the detailed per-column map used for table metadata.
882+
// The returned map always contains "name". The keys "base_type", "type", "nullable",
// "length" and "description" are added only when columnMetadata holds an entry for
// colName that carries the corresponding KBC.* metadata key.
883+
func buildColumnDetailEntry(colName string, columnMetadata keboola.ColumnsMetadata) map[string]any {
884+
col := map[string]any{"name": colName}
885+
// No metadata supplied at all: return the name-only entry.
if columnMetadata == nil {
886+
return col
887+
}
888+
889+
// Metadata exists for the table but not for this column: name-only entry as well.
colMeta, ok := columnMetadata[colName]
890+
if !ok {
891+
return col
892+
}
893+
894+
// Metadata keys not listed in the switch are ignored. If a key occurs more than
// once in colMeta, the last occurrence overwrites the earlier ones.
for _, meta := range colMeta {
895+
switch meta.Key {
896+
case "KBC.datatype.basetype":
897+
col["base_type"] = meta.Value
898+
case "KBC.datatype.type":
899+
col["type"] = meta.Value
900+
case "KBC.datatype.nullable":
901+
// Stored as a bool: true only for the exact values "1" or "true", false otherwise.
col["nullable"] = meta.Value == "1" || meta.Value == "true"
902+
case "KBC.datatype.length":
903+
col["length"] = meta.Value
904+
case "KBC.description":
905+
col["description"] = meta.Value
906+
}
907+
}
908+
return col
909+
}
910+
911+
// buildColumnEntry builds the compact per-column map used by the data dictionary.
912+
// The returned map always contains "name"; "type" and "description" are added only
// when columnMetadata holds an entry for colName with the matching metadata key.
913+
func buildColumnEntry(colName string, columnMetadata keboola.ColumnsMetadata) map[string]any {
914+
col := map[string]any{"name": colName}
915+
// No metadata supplied at all: return the name-only entry.
if columnMetadata == nil {
916+
return col
917+
}
918+
919+
// Metadata exists for the table but not for this column: name-only entry as well.
colMeta, ok := columnMetadata[colName]
920+
if !ok {
921+
return col
922+
}
923+
924+
// All other metadata keys are ignored. If a key occurs more than once in colMeta,
// the last occurrence wins.
for _, meta := range colMeta {
925+
switch meta.Key {
926+
case "KBC.datatype.basetype":
927+
// Here "type" carries the base type (KBC.datatype.basetype), whereas
// buildColumnDetailEntry fills "type" from KBC.datatype.type.
col["type"] = meta.Value
928+
case "KBC.description":
929+
col["description"] = meta.Value
930+
}
931+
}
932+
return col
933+
}
934+
893935
// formatSourceName converts a source ID to a human-readable name.
894936
func formatSourceName(source string) string {
895937
names := map[string]string{
@@ -1050,7 +1092,11 @@ func (g *Generator) generateMostConnectedNodes(ctx context.Context, data *Proces
10501092
}
10511093

10521094
sort.Slice(nodes, func(i, j int) bool {
1053-
return nodes[i].Connections > nodes[j].Connections
1095+
if nodes[i].Connections != nodes[j].Connections {
1096+
return nodes[i].Connections > nodes[j].Connections
1097+
}
1098+
// Secondary sort by UID for deterministic order when connections are equal.
1099+
return nodes[i].UID < nodes[j].UID
10541100
})
10551101

10561102
// Limit to top 20 nodes.
@@ -1105,6 +1151,11 @@ func (g *Generator) generateManifestExtended(ctx context.Context, data *Processe
11051151
docFields := ManifestExtendedDocFields()
11061152
sources := buildSourcesList(data.Buckets)
11071153

1154+
// Sort sources by ID for deterministic output.
1155+
sort.Slice(sources, func(i, j int) bool {
1156+
return sources[i]["id"].(string) < sources[j]["id"].(string)
1157+
})
1158+
11081159
// Build platform counts.
11091160
platformCounts := make(map[string]int)
11101161
for _, transform := range data.Transformations {
@@ -1209,6 +1260,97 @@ func (g *Generator) generateAIGuide(ctx context.Context, _ *ProcessedData) error
12091260
return nil
12101261
}
12111262

1263+
// generateDataDictionary writes "documentation/data-dictionary.json" under g.outputDir.
// The document contains a summary block plus one entry per table, transformation and
// component configuration taken from data. It returns a wrapped error when the
// documentation directory cannot be created, otherwise the result of g.jsonWriter.Write.
1264+
func (g *Generator) generateDataDictionary(ctx context.Context, data *ProcessedData) error {
1265+
g.logger.Debugf(ctx, "Generating data dictionary")
1266+
1267+
// Create documentation directory.
// NOTE(review): assumes Mkdir tolerates an already existing directory - confirm.
1268+
docDir := filesystem.Join(g.outputDir, "documentation")
1269+
if err := g.fs.Mkdir(ctx, docDir); err != nil {
1270+
return errors.Errorf("failed to create documentation directory: %w", err)
1271+
}
1272+
1273+
// Build tables section from actual data, keyed by table UID.
1274+
tables := make(map[string]map[string]any)
1275+
for _, table := range data.Tables {
1276+
tableEntry := map[string]any{
1277+
"name": table.Name,
1278+
"bucket": table.BucketName,
1279+
"source": table.Source,
1280+
"rows_count": table.RowsCount,
1281+
"column_count": len(table.Columns),
1282+
}
1283+
1284+
// display_name is optional and omitted when empty.
if table.DisplayName != "" {
1285+
tableEntry["display_name"] = table.DisplayName
1286+
}
1287+
1288+
// Add column details in the order of table.Columns.
1289+
columns := make([]map[string]any, 0, len(table.Columns))
1290+
for _, colName := range table.Columns {
1291+
col := buildColumnEntry(colName, table.ColumnMetadata)
1292+
columns = append(columns, col)
1293+
}
1294+
tableEntry["columns"] = columns
1295+
1296+
tables[table.UID] = tableEntry
1297+
}
1298+
1299+
// Build transformations section, keyed by transformation UID.
1300+
transformations := make(map[string]map[string]any)
1301+
for _, transform := range data.Transformations {
1302+
entry := map[string]any{
1303+
"name": transform.Name,
1304+
"platform": transform.Platform,
1305+
"is_disabled": transform.IsDisabled,
1306+
}
1307+
if transform.Description != "" {
1308+
entry["description"] = transform.Description
1309+
}
1310+
// inputs/outputs are emitted only when dependency information is present.
if transform.Dependencies != nil {
1311+
entry["inputs"] = transform.Dependencies.Consumes
1312+
entry["outputs"] = transform.Dependencies.Produces
1313+
}
1314+
transformations[transform.UID] = entry
1315+
}
1316+
1317+
// Build components section, keyed by configuration ID.
// NOTE(review): the key is config.ID alone. If two components can own configs with
// the same ID, the later entry overwrites the earlier one while "total_components"
// below still counts every element of data.ComponentConfigs - confirm IDs are unique.
1318+
components := make(map[string]map[string]any)
1319+
for _, config := range data.ComponentConfigs {
1320+
entry := map[string]any{
1321+
"name": config.Name,
1322+
"component_id": config.ComponentID,
1323+
"component_type": config.ComponentType,
1324+
"is_disabled": config.IsDisabled,
1325+
}
1326+
if config.Description != "" {
1327+
entry["description"] = config.Description
1328+
}
1329+
components[config.ID] = entry
1330+
}
1331+
1332+
// Build the data dictionary. The summary counts come from the input slices,
// not from the maps built above.
1333+
dictionary := map[string]any{
1334+
"_comment": "Auto-generated data dictionary from Keboola project",
1335+
"_purpose": "Comprehensive reference of all tables, transformations, and components",
1336+
"_update_frequency": "Generated on each export",
1337+
"project_id": data.ProjectID.String(),
1338+
// Timestamp is converted to UTC and rendered as RFC 3339.
"generated_at": data.ProcessedAt.UTC().Format(time.RFC3339),
1339+
"summary": map[string]any{
1340+
"total_tables": len(data.Tables),
1341+
"total_transformations": len(data.Transformations),
1342+
"total_components": len(data.ComponentConfigs),
1343+
"total_buckets": len(data.Buckets),
1344+
},
1345+
"tables": tables,
1346+
"transformations": transformations,
1347+
"components": components,
1348+
}
1349+
1350+
dictPath := filesystem.Join(docDir, "data-dictionary.json")
1351+
return g.jsonWriter.Write(ctx, dictPath, dictionary)
1352+
}
1353+
12121354
// GenerateSamples generates sample CSV files for tables.
12131355
func (g *Generator) GenerateSamples(ctx context.Context, data *ProcessedData, samples []*TableSample) error {
12141356
g.logger.Infof(ctx, "Generating samples for %d tables", len(samples))

0 commit comments

Comments
 (0)