Skip to content

Commit 49f929a

Browse files
vojtabiberle and claude committed
DMD-921 - Refactor calculateDataQuality to use columnStats struct
Replace the parallel nullCounts map and the nested distinctSets map[string]map[string]struct{} with a single map of columnStats structs that group nullCount and distinctVals together. This is more idiomatic and makes the code's intent clearer.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 04f48b4 commit 49f929a

1 file changed

Lines changed: 20 additions & 20 deletions

File tree

internal/pkg/llm/twinformat/generator.go

Lines changed: 20 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -1386,6 +1386,12 @@ func (g *Generator) generateSampleFile(ctx context.Context, sample *TableSample)
13861386
return nil
13871387
}
13881388

1389+
// columnStats holds quality metrics for a single column.
1390+
type columnStats struct {
1391+
nullCount int
1392+
distinctVals map[string]struct{}
1393+
}
1394+
13891395
// calculateDataQuality computes data quality metrics for a sample.
13901396
// Returns completeness (percentage of non-null values), null_counts, distinct_counts, and sample_size.
13911397
func (g *Generator) calculateDataQuality(sample *TableSample) map[string]any {
@@ -1398,42 +1404,36 @@ func (g *Generator) calculateDataQuality(sample *TableSample) map[string]any {
13981404
}
13991405
}
14001406

1401-
// Initialize counters for each column.
1402-
nullCounts := make(map[string]int)
1403-
distinctSets := make(map[string]map[string]struct{})
1407+
// Initialize stats for each column.
1408+
stats := make(map[string]*columnStats, len(sample.Columns))
14041409
for _, col := range sample.Columns {
1405-
nullCounts[col] = 0
1406-
distinctSets[col] = make(map[string]struct{})
1410+
stats[col] = &columnStats{distinctVals: make(map[string]struct{})}
14071411
}
14081412

14091413
// Process each row.
14101414
for _, row := range sample.Rows {
14111415
for i, col := range sample.Columns {
1412-
if i >= len(row) {
1413-
nullCounts[col]++
1414-
continue
1415-
}
1416-
value := row[i]
1417-
if value == "" {
1418-
nullCounts[col]++
1416+
colStats := stats[col]
1417+
if i >= len(row) || row[i] == "" {
1418+
colStats.nullCount++
14191419
} else {
1420-
distinctSets[col][value] = struct{}{}
1420+
colStats.distinctVals[row[i]] = struct{}{}
14211421
}
14221422
}
14231423
}
14241424

14251425
// Calculate completeness and distinct counts.
1426-
completeness := make(map[string]int)
1427-
distinctCounts := make(map[string]int)
1426+
completeness := make(map[string]int, len(sample.Columns))
1427+
nullCounts := make(map[string]int, len(sample.Columns))
1428+
distinctCounts := make(map[string]int, len(sample.Columns))
14281429
for _, col := range sample.Columns {
1429-
nullCount := nullCounts[col]
1430-
nonNullCount := sample.RowCount() - nullCount
1430+
colStats := stats[col]
1431+
nullCounts[col] = colStats.nullCount
1432+
distinctCounts[col] = len(colStats.distinctVals)
1433+
nonNullCount := sample.RowCount() - colStats.nullCount
14311434
if sample.RowCount() > 0 {
14321435
completeness[col] = (nonNullCount * 100) / sample.RowCount()
1433-
} else {
1434-
completeness[col] = 0
14351436
}
1436-
distinctCounts[col] = len(distinctSets[col])
14371437
}
14381438

14391439
return map[string]any{

0 commit comments

Comments
 (0)