@@ -278,8 +278,11 @@ private AnalysisResult runWithCache(Path root, Integer parallelism, AnalysisCach
278278 DetectorResult result = analyzeFile (file , root , detectorRegistry );
279279 resultSlots [idx ] = result ;
280280 if (result != null && (!result .nodes ().isEmpty () || !result .edges ().isEmpty ())) {
281+ FileClassifier .FileType ft = FileClassifier .classify (file .path (), file .language ());
282+ String snippet = computeSnippetFromFile (root .resolve (file .path ()), ft );
281283 cacheRef .storeResults (hash , file .path ().toString (), file .language (),
282- result .nodes (), result .edges ());
284+ result .nodes (), result .edges (), "DETECTED" , "antlr" ,
285+ ft .name ().toLowerCase (), snippet );
283286 }
284287 } catch (IOException e ) {
285288 log .debug ("Could not hash file {}" , file .path (), e );
@@ -583,8 +586,11 @@ private AnalysisResult runBatchedWithCache(Path root, Integer parallelism, int b
583586 DetectorResult result = analyzeFile (file , root , detectorRegistry );
584587 resultSlots [idx ] = result ;
585588 if (result != null && (!result .nodes ().isEmpty () || !result .edges ().isEmpty ())) {
589+ FileClassifier .FileType ft = FileClassifier .classify (file .path (), file .language ());
590+ String snippet = computeSnippetFromFile (root .resolve (file .path ()), ft );
586591 cache .storeResults (hash , file .path ().toString (), file .language (),
587- result .nodes (), result .edges ());
592+ result .nodes (), result .edges (), "DETECTED" , "antlr" ,
593+ ft .name ().toLowerCase (), snippet );
588594 }
589595 } catch (IOException e ) {
590596 log .debug ("Could not hash file {}" , file .path (), e );
@@ -1002,8 +1008,13 @@ private int[] processSmartBatch(
10021008 DetectorResult result = analyzeFileWithRegistry (file , root , detectorRegistry , infraRegistry , cachedContent );
10031009 slots [idx ] = result ;
10041010 if (result != null && (!result .nodes ().isEmpty () || !result .edges ().isEmpty ())) {
1011+ FileClassifier .FileType ft = FileClassifier .classify (file .path (), file .language ());
1012+ String snippet = cachedContent != null
1013+ ? computeSnippet (cachedContent , ft )
1014+ : computeSnippetFromFile (root .resolve (file .path ()), ft );
10051015 cache .storeResults (hash , file .path ().toString (), file .language (),
1006- result .nodes (), result .edges ());
1016+ result .nodes (), result .edges (), "DETECTED" , "antlr" ,
1017+ ft .name ().toLowerCase (), snippet );
10071018 }
10081019 } catch (IOException e ) {
10091020 log .debug ("Could not hash {}" , file .path (), e );
@@ -1183,6 +1194,19 @@ DetectorResult analyzeFileWithRegistry(DiscoveredFile file, Path repoPath,
11831194 String preReadContent ) {
11841195 Instant fileStart = Instant .now ();
11851196
1197+ // Classify file type before reading content
1198+ FileClassifier .FileType fileType = FileClassifier .classify (file .path (), file .language ());
1199+
1200+ // Binary files: inventory-only node, no content read needed
1201+ if (fileType == FileClassifier .FileType .BINARY ) {
1202+ return createInventoryNode (file , "binary" );
1203+ }
1204+
1205+ // Generated files: inventory-only node, skip detectors
1206+ if (fileType == FileClassifier .FileType .GENERATED ) {
1207+ return createInventoryNode (file , "generated" );
1208+ }
1209+
11861210 String content ;
11871211 if (preReadContent != null ) {
11881212 content = preReadContent ;
@@ -1197,6 +1221,16 @@ DetectorResult analyzeFileWithRegistry(DiscoveredFile file, Path repoPath,
11971221 }
11981222 }
11991223
1224+ // Test files: inventory-only node with file_type=test
1225+ if (fileType == FileClassifier .FileType .TEST ) {
1226+ return createInventoryNode (file , "test" );
1227+ }
1228+
1229+ // Text files (unknown language): inventory-only node
1230+ if (fileType == FileClassifier .FileType .TEXT ) {
1231+ return createInventoryNode (file , "text" );
1232+ }
1233+
12001234 if (isMinified (file , content )) {
12011235 log .debug ("Skipping detectors for minified file: {}" , file .path ());
12021236 String moduleName = DetectorUtils .deriveModuleName (file .path ().toString (), file .language ());
@@ -1206,10 +1240,13 @@ DetectorResult analyzeFileWithRegistry(DiscoveredFile file, Path repoPath,
12061240 file .path ().getFileName ().toString ());
12071241 node .setFilePath (file .path ().toString ());
12081242 node .setModule (moduleName );
1209- node .setProperties (new java .util .LinkedHashMap <>(Map .of ("minified" , true )));
1243+ node .setProperties (new java .util .LinkedHashMap <>(Map .of ("minified" , true , "file_type" , "minified" )));
12101244 return DetectorResult .of (List .of (node ), List .of ());
12111245 }
12121246
1247+ // SOURCE and CONFIG files: run detectors
1248+ String fileTypeStr = (fileType == FileClassifier .FileType .CONFIG ) ? "config" : "source" ;
1249+
12131250 Object parsedData = null ;
12141251 if (STRUCTURED_LANGUAGES .contains (file .language ())) {
12151252 parsedData = parser .parse (file .language (), content , file .path ().toString ());
@@ -1258,12 +1295,12 @@ DetectorResult analyzeFileWithRegistry(DiscoveredFile file, Path repoPath,
12581295 log .info ("🐢 SLOW: {} took {}ms" , file .path (), fileMs );
12591296 }
12601297
1261- if (moduleName != null ) {
1262- for (CodeNode node : allNodes ) {
1263- if (node .getModule () == null || node .getModule ().isEmpty ()) {
1264- node .setModule (moduleName );
1265- }
1298+ // Set module and file_type on all nodes
1299+ for (CodeNode node : allNodes ) {
1300+ if (moduleName != null && (node .getModule () == null || node .getModule ().isEmpty ())) {
1301+ node .setModule (moduleName );
12661302 }
1303+ node .getProperties ().put ("file_type" , fileTypeStr );
12671304 }
12681305
12691306 return DetectorResult .of (allNodes , allEdges );
@@ -1359,6 +1396,19 @@ DetectorResult analyzeFile(DiscoveredFile file, Path repoPath, DetectorRegistry
13591396 Instant fileStart = Instant .now ();
13601397 Path absPath = repoPath .resolve (file .path ());
13611398
1399+ // Classify file type before reading content
1400+ FileClassifier .FileType fileType = FileClassifier .classify (file .path (), file .language ());
1401+
1402+ // Binary files: inventory-only node, no content read needed
1403+ if (fileType == FileClassifier .FileType .BINARY ) {
1404+ return createInventoryNode (file , "binary" );
1405+ }
1406+
1407+ // Generated files: inventory-only node, skip detectors
1408+ if (fileType == FileClassifier .FileType .GENERATED ) {
1409+ return createInventoryNode (file , "generated" );
1410+ }
1411+
13621412 // Read file content
13631413 String content ;
13641414 try {
@@ -1369,6 +1419,16 @@ DetectorResult analyzeFile(DiscoveredFile file, Path repoPath, DetectorRegistry
13691419 return DetectorResult .empty ();
13701420 }
13711421
1422+ // Test files: inventory-only node with file_type=test
1423+ if (fileType == FileClassifier .FileType .TEST ) {
1424+ return createInventoryNode (file , "test" );
1425+ }
1426+
1427+ // Text files (unknown language): inventory-only node
1428+ if (fileType == FileClassifier .FileType .TEXT ) {
1429+ return createInventoryNode (file , "text" );
1430+ }
1431+
13721432 // Minified file detection: create a node with minified=true but skip detectors
13731433 if (isMinified (file , content )) {
13741434 log .debug ("Skipping detectors for minified file: {}" , file .path ());
@@ -1379,10 +1439,13 @@ DetectorResult analyzeFile(DiscoveredFile file, Path repoPath, DetectorRegistry
13791439 file .path ().getFileName ().toString ());
13801440 node .setFilePath (file .path ().toString ());
13811441 node .setModule (moduleName );
1382- node .setProperties (new java .util .LinkedHashMap <>(Map .of ("minified" , true )));
1442+ node .setProperties (new java .util .LinkedHashMap <>(Map .of ("minified" , true , "file_type" , "minified" )));
13831443 return DetectorResult .of (List .of (node ), List .of ());
13841444 }
13851445
1446+ // SOURCE and CONFIG files: run detectors
1447+ String fileTypeStr = (fileType == FileClassifier .FileType .CONFIG ) ? "config" : "source" ;
1448+
13861449 // Parse structured data if applicable
13871450 Object parsedData = null ;
13881451 if (STRUCTURED_LANGUAGES .contains (file .language ())) {
@@ -1440,18 +1503,34 @@ DetectorResult analyzeFile(DiscoveredFile file, Path repoPath, DetectorRegistry
14401503 log .info ("🐢 SLOW: {} took {}ms" , file .path (), fileMs );
14411504 }
14421505
1443- // Set module on all nodes that don't have one yet
1444- if (moduleName != null ) {
1445- for (CodeNode node : allNodes ) {
1446- if (node .getModule () == null || node .getModule ().isEmpty ()) {
1447- node .setModule (moduleName );
1448- }
1506+ // Set module and file_type on all nodes
1507+ for (CodeNode node : allNodes ) {
1508+ if (moduleName != null && (node .getModule () == null || node .getModule ().isEmpty ())) {
1509+ node .setModule (moduleName );
14491510 }
1511+ node .getProperties ().put ("file_type" , fileTypeStr );
14501512 }
14511513
14521514 return DetectorResult .of (allNodes , allEdges );
14531515 }
14541516
1517+ /**
1518+ * Create an inventory-only node for files that should not have detectors run.
1519+ */
1520+ private static DetectorResult createInventoryNode (DiscoveredFile file , String fileType ) {
1521+ String moduleName = DetectorUtils .deriveModuleName (file .path ().toString (), file .language ());
1522+ CodeNode node = new CodeNode (
1523+ "file:" + file .path () + ":module:" + (moduleName != null ? moduleName : file .path ().getFileName ().toString ()),
1524+ NodeKind .MODULE ,
1525+ file .path ().getFileName ().toString ());
1526+ node .setFilePath (file .path ().toString ());
1527+ node .setModule (moduleName );
1528+ node .setProperties (new java .util .LinkedHashMap <>(Map .of (
1529+ "file_type" , fileType ,
1530+ "language" , file .language () != null ? file .language () : "" )));
1531+ return DetectorResult .of (List .of (node ), List .of ());
1532+ }
1533+
14551534 /**
14561535 * Regex-only analysis fallback for files where ANTLR timed out.
14571536 * Ensures zero data loss — every file produces nodes via regex detection.
@@ -1544,6 +1623,38 @@ private String getGitHead(Path repoPath) {
15441623 return null ;
15451624 }
15461625
1626+ /**
1627+ * Read file content and compute snippet. Returns null on error or for binary files.
1628+ */
1629+ private static String computeSnippetFromFile (Path absPath , FileClassifier .FileType fileType ) {
1630+ if (fileType == FileClassifier .FileType .BINARY ) return null ;
1631+ try {
1632+ byte [] raw = Files .readAllBytes (absPath );
1633+ String content = DetectorUtils .decodeContent (raw );
1634+ return computeSnippet (content , fileType );
1635+ } catch (IOException e ) {
1636+ return null ;
1637+ }
1638+ }
1639+
1640+ /**
1641+ * Compute a snippet from file content for storage in H2.
1642+ * Returns the first 200 lines, capped at 10KB, or null for binary files.
1643+ */
1644+ static String computeSnippet (String content , FileClassifier .FileType fileType ) {
1645+ if (fileType == FileClassifier .FileType .BINARY ) return null ;
1646+ if (content == null || content .isEmpty ()) return null ;
1647+ // First 200 lines, max 10KB
1648+ String [] lines = content .split ("\n " , 201 );
1649+ StringBuilder sb = new StringBuilder ();
1650+ for (int i = 0 ; i < Math .min (lines .length , 200 ); i ++) {
1651+ if (sb .length () + lines [i ].length () > 10_000 ) break ;
1652+ if (i > 0 ) sb .append ('\n' );
1653+ sb .append (lines [i ]);
1654+ }
1655+ return sb .toString ();
1656+ }
1657+
15471658 /**
15481659 * Pre-compile exclude glob patterns into regex Pattern objects.
15491660 */
0 commit comments