@@ -1386,6 +1386,12 @@ func (g *Generator) generateSampleFile(ctx context.Context, sample *TableSample)
13861386 return nil
13871387}
13881388
1389+ // columnStats accumulates the data quality counters for a single column while the rows of a sample are scanned.
1390+ type columnStats struct {
1391+ nullCount int // rows where the value is missing (row shorter than the column index) or is the empty string
1392+ distinctVals map [string ]struct {} // set of distinct non-empty values seen; its length is reported as the distinct count
1393+ }
1394+
13891395// calculateDataQuality computes data quality metrics for a sample.
13901396// Returns completeness (percentage of non-null values), null_counts, distinct_counts, and sample_size.
13911397func (g * Generator ) calculateDataQuality (sample * TableSample ) map [string ]any {
@@ -1398,42 +1404,36 @@ func (g *Generator) calculateDataQuality(sample *TableSample) map[string]any {
13981404 }
13991405 }
14001406
1401- // Initialize counters for each column.
1402- nullCounts := make (map [string ]int )
1403- distinctSets := make (map [string ]map [string ]struct {})
1407+ // Initialize stats for each column.
1408+ stats := make (map [string ]* columnStats , len (sample .Columns ))
14041409 for _ , col := range sample .Columns {
1405- nullCounts [col ] = 0
1406- distinctSets [col ] = make (map [string ]struct {})
1410+ stats [col ] = & columnStats {distinctVals : make (map [string ]struct {})}
14071411 }
14081412
14091413 // Process each row.
14101414 for _ , row := range sample .Rows {
14111415 for i , col := range sample .Columns {
1412- if i >= len (row ) {
1413- nullCounts [col ]++
1414- continue
1415- }
1416- value := row [i ]
1417- if value == "" {
1418- nullCounts [col ]++
1416+ colStats := stats [col ]
1417+ if i >= len (row ) || row [i ] == "" {
1418+ colStats .nullCount ++
14191419 } else {
1420- distinctSets [ col ][ value ] = struct {}{}
1420+ colStats. distinctVals [ row [ i ] ] = struct {}{}
14211421 }
14221422 }
14231423 }
14241424
14251425 // Calculate completeness and distinct counts.
1426- completeness := make (map [string ]int )
1427- distinctCounts := make (map [string ]int )
1426+ completeness := make (map [string ]int , len (sample .Columns ))
1427+ nullCounts := make (map [string ]int , len (sample .Columns ))
1428+ distinctCounts := make (map [string ]int , len (sample .Columns ))
14281429 for _ , col := range sample .Columns {
1429- nullCount := nullCounts [col ]
1430- nonNullCount := sample .RowCount () - nullCount
1430+ colStats := stats [col ]
1431+ nullCounts [col ] = colStats .nullCount
1432+ distinctCounts [col ] = len (colStats .distinctVals )
1433+ nonNullCount := sample .RowCount () - colStats .nullCount
14311434 if sample .RowCount () > 0 {
14321435 completeness [col ] = (nonNullCount * 100 ) / sample .RowCount ()
1433- } else {
1434- completeness [col ] = 0
14351436 }
1436- distinctCounts [col ] = len (distinctSets [col ])
14371437 }
14381438
14391439 return map [string ]any {
0 commit comments