@@ -300,7 +300,21 @@ private AnalysisResult runWithCache(Path root, Integer parallelism, AnalysisCach
300300 futures .get (i ).cancel (true );
301301 DiscoveredFile timedOutFile = files .get (i );
302302 log .warn ("⏱️ ANTLR timed out for {} (30s), running regex fallback" , timedOutFile .path ());
303- resultSlots [i ] = analyzeFileRegexOnly (timedOutFile , root , detectorRegistry );
303+ DetectorResult regexResult = analyzeFileRegexOnly (timedOutFile , root , detectorRegistry );
304+ resultSlots [i ] = regexResult ;
305+ // Store regex fallback result to cache with explicit detection_method
306+ if (cache != null && regexResult != null
307+ && (!regexResult .nodes ().isEmpty () || !regexResult .edges ().isEmpty ())) {
308+ try {
309+ Path absPath = root .resolve (timedOutFile .path ());
310+ String hash = FileHasher .hash (absPath );
311+ cache .storeResults (hash , timedOutFile .path ().toString (),
312+ timedOutFile .language (), regexResult .nodes (), regexResult .edges (),
313+ "DETECTED" , "regex_fallback" );
314+ } catch (IOException ioe ) {
315+ log .debug ("Could not hash for regex fallback cache: {}" , timedOutFile .path (), ioe );
316+ }
317+ }
304318 } catch (ExecutionException e ) {
305319 log .warn ("Analysis failed for {}" , files .get (i ).path (), e .getCause ());
306320 } catch (InterruptedException e ) {
@@ -592,7 +606,21 @@ private AnalysisResult runBatchedWithCache(Path root, Integer parallelism, int b
592606 // Zero data loss: run regex-only fallback instead of skipping
593607 DiscoveredFile timedOutFile = batch .get (i );
594608 log .warn ("⏱️ ANTLR timed out for {} (30s), running regex fallback" , timedOutFile .path ());
595- resultSlots [i ] = analyzeFileRegexOnly (timedOutFile , root , detectorRegistry );
609+ DetectorResult regexResult = analyzeFileRegexOnly (timedOutFile , root , detectorRegistry );
610+ resultSlots [i ] = regexResult ;
611+ // Store regex fallback result to cache with explicit detection_method
612+ if (incremental && regexResult != null
613+ && (!regexResult .nodes ().isEmpty () || !regexResult .edges ().isEmpty ())) {
614+ try {
615+ Path absPath = root .resolve (timedOutFile .path ());
616+ String hash = FileHasher .hash (absPath );
617+ cache .storeResults (hash , timedOutFile .path ().toString (),
618+ timedOutFile .language (), regexResult .nodes (), regexResult .edges (),
619+ "DETECTED" , "regex_fallback" );
620+ } catch (IOException ioe ) {
621+ log .debug ("Could not hash for regex fallback cache: {}" , timedOutFile .path (), ioe );
622+ }
623+ }
596624 } catch (ExecutionException e ) {
597625 log .warn ("Analysis failed for {}" , batch .get (i ).path (), e .getCause ());
598626 } catch (InterruptedException e ) {
@@ -820,6 +848,8 @@ private AnalysisResult runSmartWithCache(Path root, Integer parallelism, int bat
820848
821849 try (var executor = createExecutor (parallelism )) {
822850 List <DiscoveredFile > pendingBatch = new ArrayList <>(batchSize );
851+ List <CodeNode > filteredNodes = new ArrayList <>();
852+ Map <String , String > contentCache = new HashMap <>();
823853 int moduleIndex = 0 ;
824854
825855 for (String moduleKey : sortedModuleKeys ) {
@@ -828,7 +858,8 @@ private AnalysisResult runSmartWithCache(Path root, Integer parallelism, int bat
828858 report .accept ("Processing module " + moduleIndex + "/" + sortedModuleKeys .size ()
829859 + ": " + moduleKey + " (" + moduleFiles .size () + " files)" );
830860
831- // Pre-filter source files with keyword filter; always pass structured files
861+ // Pre-filter source files with keyword filter; always pass structured files.
862+ // Cache decoded content from the keyword filter to avoid re-reading in analyzeFileWithRegistry.
832863 List <DiscoveredFile > filtered = new ArrayList <>(moduleFiles .size ());
833864 for (DiscoveredFile file : moduleFiles ) {
834865 if (STRUCTURED_LANGUAGES .contains (file .language ())) {
@@ -841,8 +872,22 @@ private AnalysisResult runSmartWithCache(Path root, Integer parallelism, int bat
841872 byte [] raw = Files .readAllBytes (absPath );
842873 if (keywordFilter .shouldAnalyze (raw , file .language ())) {
843874 filtered .add (file );
875+ // Cache decoded content to avoid duplicate read in analyzeFileWithRegistry
876+ contentCache .put (file .path ().toString (),
877+ DetectorUtils .decodeContent (raw ));
844878 } else {
845879 filesSkipped ++;
880+ // Zero data loss: create minimal inventory node for filtered files
881+ CodeNode fileNode = new CodeNode (
882+ "file:" + file .path () + ":module:" + file .path ().getFileName (),
883+ NodeKind .MODULE ,
884+ file .path ().getFileName ().toString ());
885+ fileNode .setFilePath (file .path ().toString ());
886+ fileNode .setModule (DetectorUtils .deriveModuleName (file .path ().toString (), file .language ()));
887+ fileNode .getProperties ().put ("status" , "filtered" );
888+ fileNode .getProperties ().put ("language" , file .language ());
889+ fileNode .getProperties ().put ("detection_method" , "none" );
890+ filteredNodes .add (fileNode );
846891 log .debug ("⏭️ SKIP: {} ({}, {} bytes) — no architecture keywords" ,
847892 file .path (), file .language (), raw .length );
848893 }
@@ -861,29 +906,33 @@ private AnalysisResult runSmartWithCache(Path root, Integer parallelism, int bat
861906 var batchResult = processSmartBatch (pendingBatch , root , executor .delegate (),
862907 detectorRegistry , infraRegistry , incremental , cache ,
863908 nodeBreakdown , edgeBreakdown , frameworkBreakdown ,
864- batchNumber , report );
909+ batchNumber , report , contentCache , filteredNodes );
865910 totalNodesWritten += batchResult [0 ];
866911 totalEdgesWritten += batchResult [1 ];
867912 filesAnalyzed += batchResult [2 ];
868913 cacheHits += batchResult [3 ];
869914 pendingBatch .clear ();
915+ filteredNodes .clear ();
870916 }
871917 }
872918 }
873919
874- // Flush remaining files
875- if (!pendingBatch .isEmpty ()) {
920+ // Flush remaining files (including any accumulated filtered nodes)
921+ if (!pendingBatch .isEmpty () || ! filteredNodes . isEmpty () ) {
876922 batchNumber ++;
877923 var batchResult = processSmartBatch (pendingBatch , root , executor .delegate (),
878924 detectorRegistry , infraRegistry , incremental , cache ,
879925 nodeBreakdown , edgeBreakdown , frameworkBreakdown ,
880- batchNumber , report );
926+ batchNumber , report , contentCache , filteredNodes );
881927 totalNodesWritten += batchResult [0 ];
882928 totalEdgesWritten += batchResult [1 ];
883929 filesAnalyzed += batchResult [2 ];
884930 cacheHits += batchResult [3 ];
885931 pendingBatch .clear ();
932+ filteredNodes .clear ();
886933 }
934+ // Drop any leftover cached content once the remaining files have been flushed, to free memory
935+ contentCache .clear ();
887936 }
888937
889938 if (filesSkipped > 0 ) {
@@ -922,7 +971,9 @@ private int[] processSmartBatch(
922971 boolean incremental , AnalysisCache cache ,
923972 Map <String , Integer > nodeBreakdown , Map <String , Integer > edgeBreakdown ,
924973 Map <String , Integer > frameworkBreakdown ,
925- int batchNumber , Consumer <String > report ) {
974+ int batchNumber , Consumer <String > report ,
975+ Map <String , String > contentCache ,
976+ List <CodeNode > filteredNodes ) {
926977
927978 report .accept ("Processing batch " + batchNumber + " (" + batch .size () + " files)..." );
928979 Instant batchStart = Instant .now ();
@@ -934,6 +985,7 @@ private int[] processSmartBatch(
934985 for (int i = 0 ; i < batch .size (); i ++) {
935986 final int idx = i ;
936987 final DiscoveredFile file = batch .get (idx );
988+ final String cachedContent = contentCache .remove (file .path ().toString ());
937989 futures .add (executor .submit (() -> {
938990 if (incremental ) {
939991 try {
@@ -947,18 +999,18 @@ private int[] processSmartBatch(
947999 return null ;
9481000 }
9491001 }
950- DetectorResult result = analyzeFileWithRegistry (file , root , detectorRegistry , infraRegistry );
1002+ DetectorResult result = analyzeFileWithRegistry (file , root , detectorRegistry , infraRegistry , cachedContent );
9511003 slots [idx ] = result ;
9521004 if (result != null && (!result .nodes ().isEmpty () || !result .edges ().isEmpty ())) {
9531005 cache .storeResults (hash , file .path ().toString (), file .language (),
9541006 result .nodes (), result .edges ());
9551007 }
9561008 } catch (IOException e ) {
9571009 log .debug ("Could not hash {}" , file .path (), e );
958- slots [idx ] = analyzeFileWithRegistry (file , root , detectorRegistry , infraRegistry );
1010+ slots [idx ] = analyzeFileWithRegistry (file , root , detectorRegistry , infraRegistry , cachedContent );
9591011 }
9601012 } else {
961- slots [idx ] = analyzeFileWithRegistry (file , root , detectorRegistry , infraRegistry );
1013+ slots [idx ] = analyzeFileWithRegistry (file , root , detectorRegistry , infraRegistry , cachedContent );
9621014 }
9631015 return null ;
9641016 }));
@@ -971,7 +1023,21 @@ private int[] processSmartBatch(
9711023 futures .get (i ).cancel (true );
9721024 DiscoveredFile timedOutFile = batch .get (i );
9731025 log .warn ("⏱️ ANTLR timed out for {} (30s), running regex fallback" , timedOutFile .path ());
974- slots [i ] = analyzeFileRegexOnly (timedOutFile , root , detectorRegistry );
1026+ DetectorResult regexResult = analyzeFileRegexOnly (timedOutFile , root , detectorRegistry );
1027+ slots [i ] = regexResult ;
1028+ // Store regex fallback result to cache with explicit detection_method
1029+ if (incremental && regexResult != null
1030+ && (!regexResult .nodes ().isEmpty () || !regexResult .edges ().isEmpty ())) {
1031+ try {
1032+ Path absPath = root .resolve (timedOutFile .path ());
1033+ String hash = FileHasher .hash (absPath );
1034+ cache .storeResults (hash , timedOutFile .path ().toString (),
1035+ timedOutFile .language (), regexResult .nodes (), regexResult .edges (),
1036+ "DETECTED" , "regex_fallback" );
1037+ } catch (IOException ioe ) {
1038+ log .debug ("Could not hash for regex fallback cache: {}" , timedOutFile .path (), ioe );
1039+ }
1040+ }
9751041 } catch (ExecutionException e ) {
9761042 log .warn ("Analysis failed for {}" , batch .get (i ).path (), e .getCause ());
9771043 } catch (InterruptedException e ) {
@@ -1020,6 +1086,15 @@ private int[] processSmartBatch(
10201086 }
10211087 }
10221088
1089+ // Add filtered file inventory nodes to batch results
1090+ if (filteredNodes != null && !filteredNodes .isEmpty ()) {
1091+ batchNodes .addAll (filteredNodes );
1092+ for (CodeNode fn : filteredNodes ) {
1093+ nodeBreakdown .merge (fn .getKind ().getValue (), 1 , Integer ::sum );
1094+ }
1095+ nodes += filteredNodes .size ();
1096+ }
1097+
10231098 if (!incremental && (!batchNodes .isEmpty () || !batchEdges .isEmpty ())) {
10241099 String batchId = "batch:" + batchNumber + ":" + System .nanoTime ();
10251100 cache .storeBatchResults (batchId , "batch-" + batchNumber , "mixed" , batchNodes , batchEdges );
@@ -1092,16 +1167,34 @@ Map<String, List<DiscoveredFile>> detectModules(Path root, List<DiscoveredFile>
10921167 DetectorResult analyzeFileWithRegistry (DiscoveredFile file , Path repoPath ,
10931168 DetectorRegistry detectorRegistry ,
10941169 InfrastructureRegistry infraRegistry ) {
1170+ return analyzeFileWithRegistry (file , repoPath , detectorRegistry , infraRegistry , null );
1171+ }
1172+
1173+ /**
1174+ * Analyze a single file using the given registries, optionally with pre-read content.
1175+ * When {@code preReadContent} is non-null, it is used directly instead of reading from disk,
1176+ * avoiding a duplicate file read (the content was already read during keyword filtering).
1177+ *
1178+ * @param preReadContent decoded file content from the keyword filter, or null to read from disk
1179+ */
1180+ DetectorResult analyzeFileWithRegistry (DiscoveredFile file , Path repoPath ,
1181+ DetectorRegistry detectorRegistry ,
1182+ InfrastructureRegistry infraRegistry ,
1183+ String preReadContent ) {
10951184 Instant fileStart = Instant .now ();
1096- Path absPath = repoPath .resolve (file .path ());
10971185
10981186 String content ;
1099- try {
1100- byte [] raw = Files .readAllBytes (absPath );
1101- content = DetectorUtils .decodeContent (raw );
1102- } catch (IOException e ) {
1103- log .debug ("Could not read file: {}" , absPath , e );
1104- return DetectorResult .empty ();
1187+ if (preReadContent != null ) {
1188+ content = preReadContent ;
1189+ } else {
1190+ Path absPath = repoPath .resolve (file .path ());
1191+ try {
1192+ byte [] raw = Files .readAllBytes (absPath );
1193+ content = DetectorUtils .decodeContent (raw );
1194+ } catch (IOException e ) {
1195+ log .debug ("Could not read file: {}" , absPath , e );
1196+ return DetectorResult .empty ();
1197+ }
11051198 }
11061199
11071200 if (isMinified (file , content )) {
0 commit comments