
Commit 4304caf

aksOps and claude committed
feat: file inventory nodes, H2 status tracking, and performance optimizations
Zero data loss architecture (Part 2):

- File inventory: keyword-filtered files now get MODULE nodes with status=filtered, language, and detection_method=none, giving complete graph coverage
- H2 status tracking: the files table now has status (DETECTED/FILTERED/FAILED) and detection_method (antlr/regex/regex_fallback/none) columns
- Cache version bumped to 3 for schema migration

Performance optimizations:

- Eliminate duplicate file reads: content from the keyword filter is cached and passed to analyzeFile, avoiding a second Files.readAllBytes() call (~25MB per batch of 500 files, cleared after each batch)
- DocCommentExtractor file grouping: LexicalEnricher groups nodes by filePath and reads each file once per group instead of once per node; 100 nodes in one file means 1 disk read instead of 100
- TopologyService: pre-build degree maps in O(N+M) instead of O(N*M) stream filtering per service; use a HashSet for cycle deduplication (see the sketches below)
- GoWebDetector: bounded regex [^\n]*? instead of .*? with DOTALL to prevent exponential backtracking

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 6886769 commit 4304caf
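Only two of the seven changed files are shown below. For the GoWebDetector bullet in the message, here is a minimal sketch of the bounded-regex idea; the HandleFunc pattern and class name are illustrative, not the detector's actual code:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

class BoundedRegexSketch {
    // With DOTALL, ".*?" may grow across line boundaries, so a large file with
    // many near-matches can push the engine into runaway backtracking.
    static final Pattern UNBOUNDED = Pattern.compile(
            "HandleFunc\\(.*?\"(.*?)\"", Pattern.DOTALL);

    // "[^\n]*?" can never cross a newline: every candidate match is confined
    // to a single line, so a pathological input degrades gracefully instead.
    static final Pattern BOUNDED = Pattern.compile(
            "HandleFunc\\([^\\n]*?\"([^\\n]*?)\"");

    public static void main(String[] args) {
        Matcher m = BOUNDED.matcher("r.HandleFunc(\"/users\", listUsers)");
        if (m.find()) {
            System.out.println(m.group(1)); // prints /users
        }
    }
}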

7 files changed

Lines changed: 221 additions & 55 deletions
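The TopologyService change is likewise not among the diffs below. A sketch of the degree-map idea under assumed graph shapes (Edge and the service names are hypothetical): one pass over the M edges builds both maps, so per-service lookups become O(1) and the whole computation is O(N+M) rather than filtering the full edge list once per service.

import java.util.HashMap;
import java.util.List;
import java.util.Map;

record Edge(String from, String to) {}

class DegreeMapsSketch {
    public static void main(String[] args) {
        List<String> services = List.of("api", "billing", "auth");
        List<Edge> edges = List.of(
                new Edge("api", "auth"), new Edge("api", "billing"),
                new Edge("billing", "auth"));

        // One pass over the edges builds both degree maps, replacing an
        // O(N*M) stream filter executed once per service.
        Map<String, Integer> outDegree = new HashMap<>();
        Map<String, Integer> inDegree = new HashMap<>();
        for (Edge e : edges) {
            outDegree.merge(e.from(), 1, Integer::sum);
            inDegree.merge(e.to(), 1, Integer::sum);
        }

        for (String s : services) {
            System.out.printf("%s: out=%d in=%d%n", s,
                    outDegree.getOrDefault(s, 0), inDegree.getOrDefault(s, 0));
        }
    }
}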

File tree

src/main/java/io/github/randomcodespace/iq/analyzer/Analyzer.java

Lines changed: 112 additions & 19 deletions
@@ -300,7 +300,21 @@ private AnalysisResult runWithCache(Path root, Integer parallelism, AnalysisCach
                 futures.get(i).cancel(true);
                 DiscoveredFile timedOutFile = files.get(i);
                 log.warn("⏱️ ANTLR timed out for {} (30s), running regex fallback", timedOutFile.path());
-                resultSlots[i] = analyzeFileRegexOnly(timedOutFile, root, detectorRegistry);
+                DetectorResult regexResult = analyzeFileRegexOnly(timedOutFile, root, detectorRegistry);
+                resultSlots[i] = regexResult;
+                // Store regex fallback result to cache with explicit detection_method
+                if (cache != null && regexResult != null
+                        && (!regexResult.nodes().isEmpty() || !regexResult.edges().isEmpty())) {
+                    try {
+                        Path absPath = root.resolve(timedOutFile.path());
+                        String hash = FileHasher.hash(absPath);
+                        cache.storeResults(hash, timedOutFile.path().toString(),
+                                timedOutFile.language(), regexResult.nodes(), regexResult.edges(),
+                                "DETECTED", "regex_fallback");
+                    } catch (IOException ioe) {
+                        log.debug("Could not hash for regex fallback cache: {}", timedOutFile.path(), ioe);
+                    }
+                }
             } catch (ExecutionException e) {
                 log.warn("Analysis failed for {}", files.get(i).path(), e.getCause());
             } catch (InterruptedException e) {
@@ -592,7 +606,21 @@ private AnalysisResult runBatchedWithCache(Path root, Integer parallelism, int b
                 // Zero data loss: run regex-only fallback instead of skipping
                 DiscoveredFile timedOutFile = batch.get(i);
                 log.warn("⏱️ ANTLR timed out for {} (30s), running regex fallback", timedOutFile.path());
-                resultSlots[i] = analyzeFileRegexOnly(timedOutFile, root, detectorRegistry);
+                DetectorResult regexResult = analyzeFileRegexOnly(timedOutFile, root, detectorRegistry);
+                resultSlots[i] = regexResult;
+                // Store regex fallback result to cache with explicit detection_method
+                if (incremental && regexResult != null
+                        && (!regexResult.nodes().isEmpty() || !regexResult.edges().isEmpty())) {
+                    try {
+                        Path absPath = root.resolve(timedOutFile.path());
+                        String hash = FileHasher.hash(absPath);
+                        cache.storeResults(hash, timedOutFile.path().toString(),
+                                timedOutFile.language(), regexResult.nodes(), regexResult.edges(),
+                                "DETECTED", "regex_fallback");
+                    } catch (IOException ioe) {
+                        log.debug("Could not hash for regex fallback cache: {}", timedOutFile.path(), ioe);
+                    }
+                }
             } catch (ExecutionException e) {
                 log.warn("Analysis failed for {}", batch.get(i).path(), e.getCause());
             } catch (InterruptedException e) {
@@ -820,6 +848,8 @@ private AnalysisResult runSmartWithCache(Path root, Integer parallelism, int bat
 
         try (var executor = createExecutor(parallelism)) {
             List<DiscoveredFile> pendingBatch = new ArrayList<>(batchSize);
+            List<CodeNode> filteredNodes = new ArrayList<>();
+            Map<String, String> contentCache = new HashMap<>();
             int moduleIndex = 0;
 
             for (String moduleKey : sortedModuleKeys) {
@@ -828,7 +858,8 @@ private AnalysisResult runSmartWithCache(Path root, Integer parallelism, int bat
                 report.accept("Processing module " + moduleIndex + "/" + sortedModuleKeys.size()
                         + ": " + moduleKey + " (" + moduleFiles.size() + " files)");
 
-                // Pre-filter source files with keyword filter; always pass structured files
+                // Pre-filter source files with keyword filter; always pass structured files.
+                // Cache decoded content from the keyword filter to avoid re-reading in analyzeFile.
                 List<DiscoveredFile> filtered = new ArrayList<>(moduleFiles.size());
                 for (DiscoveredFile file : moduleFiles) {
                     if (STRUCTURED_LANGUAGES.contains(file.language())) {
@@ -841,8 +872,22 @@ private AnalysisResult runSmartWithCache(Path root, Integer parallelism, int bat
                         byte[] raw = Files.readAllBytes(absPath);
                         if (keywordFilter.shouldAnalyze(raw, file.language())) {
                             filtered.add(file);
+                            // Cache decoded content to avoid duplicate read in analyzeFile
+                            contentCache.put(file.path().toString(),
+                                    DetectorUtils.decodeContent(raw));
                         } else {
                             filesSkipped++;
+                            // Zero data loss: create minimal inventory node for filtered files
+                            CodeNode fileNode = new CodeNode(
+                                    "file:" + file.path() + ":module:" + file.path().getFileName(),
+                                    NodeKind.MODULE,
+                                    file.path().getFileName().toString());
+                            fileNode.setFilePath(file.path().toString());
+                            fileNode.setModule(DetectorUtils.deriveModuleName(file.path().toString(), file.language()));
+                            fileNode.getProperties().put("status", "filtered");
+                            fileNode.getProperties().put("language", file.language());
+                            fileNode.getProperties().put("detection_method", "none");
+                            filteredNodes.add(fileNode);
                             log.debug("⏭️ SKIP: {} ({}, {} bytes) — no architecture keywords",
                                     file.path(), file.language(), raw.length);
                         }
@@ -861,29 +906,33 @@ private AnalysisResult runSmartWithCache(Path root, Integer parallelism, int bat
                         var batchResult = processSmartBatch(pendingBatch, root, executor.delegate(),
                                 detectorRegistry, infraRegistry, incremental, cache,
                                 nodeBreakdown, edgeBreakdown, frameworkBreakdown,
-                                batchNumber, report);
+                                batchNumber, report, contentCache, filteredNodes);
                         totalNodesWritten += batchResult[0];
                         totalEdgesWritten += batchResult[1];
                         filesAnalyzed += batchResult[2];
                         cacheHits += batchResult[3];
                         pendingBatch.clear();
+                        filteredNodes.clear();
                     }
                 }
             }
 
-            // Flush remaining files
-            if (!pendingBatch.isEmpty()) {
+            // Flush remaining files (including any accumulated filtered nodes)
+            if (!pendingBatch.isEmpty() || !filteredNodes.isEmpty()) {
                 batchNumber++;
                 var batchResult = processSmartBatch(pendingBatch, root, executor.delegate(),
                         detectorRegistry, infraRegistry, incremental, cache,
                         nodeBreakdown, edgeBreakdown, frameworkBreakdown,
-                        batchNumber, report);
+                        batchNumber, report, contentCache, filteredNodes);
                 totalNodesWritten += batchResult[0];
                 totalEdgesWritten += batchResult[1];
                 filesAnalyzed += batchResult[2];
                 cacheHits += batchResult[3];
                 pendingBatch.clear();
+                filteredNodes.clear();
             }
+            // Clear content cache after all batches in this module to free memory
+            contentCache.clear();
         }
 
         if (filesSkipped > 0) {
@@ -922,7 +971,9 @@ private int[] processSmartBatch(
             boolean incremental, AnalysisCache cache,
             Map<String, Integer> nodeBreakdown, Map<String, Integer> edgeBreakdown,
             Map<String, Integer> frameworkBreakdown,
-            int batchNumber, Consumer<String> report) {
+            int batchNumber, Consumer<String> report,
+            Map<String, String> contentCache,
+            List<CodeNode> filteredNodes) {
 
         report.accept("Processing batch " + batchNumber + " (" + batch.size() + " files)...");
         Instant batchStart = Instant.now();
@@ -934,6 +985,7 @@ private int[] processSmartBatch(
         for (int i = 0; i < batch.size(); i++) {
             final int idx = i;
             final DiscoveredFile file = batch.get(idx);
+            final String cachedContent = contentCache.remove(file.path().toString());
             futures.add(executor.submit(() -> {
                 if (incremental) {
                     try {
@@ -947,18 +999,18 @@ private int[] processSmartBatch(
                                 return null;
                             }
                         }
-                        DetectorResult result = analyzeFileWithRegistry(file, root, detectorRegistry, infraRegistry);
+                        DetectorResult result = analyzeFileWithRegistry(file, root, detectorRegistry, infraRegistry, cachedContent);
                         slots[idx] = result;
                         if (result != null && (!result.nodes().isEmpty() || !result.edges().isEmpty())) {
                             cache.storeResults(hash, file.path().toString(), file.language(),
                                     result.nodes(), result.edges());
                         }
                     } catch (IOException e) {
                         log.debug("Could not hash {}", file.path(), e);
-                        slots[idx] = analyzeFileWithRegistry(file, root, detectorRegistry, infraRegistry);
+                        slots[idx] = analyzeFileWithRegistry(file, root, detectorRegistry, infraRegistry, cachedContent);
                     }
                 } else {
-                    slots[idx] = analyzeFileWithRegistry(file, root, detectorRegistry, infraRegistry);
+                    slots[idx] = analyzeFileWithRegistry(file, root, detectorRegistry, infraRegistry, cachedContent);
                 }
                 return null;
             }));
@@ -971,7 +1023,21 @@ private int[] processSmartBatch(
                 futures.get(i).cancel(true);
                 DiscoveredFile timedOutFile = batch.get(i);
                 log.warn("⏱️ ANTLR timed out for {} (30s), running regex fallback", timedOutFile.path());
-                slots[i] = analyzeFileRegexOnly(timedOutFile, root, detectorRegistry);
+                DetectorResult regexResult = analyzeFileRegexOnly(timedOutFile, root, detectorRegistry);
+                slots[i] = regexResult;
+                // Store regex fallback result to cache with explicit detection_method
+                if (incremental && regexResult != null
+                        && (!regexResult.nodes().isEmpty() || !regexResult.edges().isEmpty())) {
+                    try {
+                        Path absPath = root.resolve(timedOutFile.path());
+                        String hash = FileHasher.hash(absPath);
+                        cache.storeResults(hash, timedOutFile.path().toString(),
+                                timedOutFile.language(), regexResult.nodes(), regexResult.edges(),
+                                "DETECTED", "regex_fallback");
+                    } catch (IOException ioe) {
+                        log.debug("Could not hash for regex fallback cache: {}", timedOutFile.path(), ioe);
+                    }
+                }
             } catch (ExecutionException e) {
                 log.warn("Analysis failed for {}", batch.get(i).path(), e.getCause());
             } catch (InterruptedException e) {
@@ -1020,6 +1086,15 @@ private int[] processSmartBatch(
             }
         }
 
+        // Add filtered file inventory nodes to batch results
+        if (filteredNodes != null && !filteredNodes.isEmpty()) {
+            batchNodes.addAll(filteredNodes);
+            for (CodeNode fn : filteredNodes) {
+                nodeBreakdown.merge(fn.getKind().getValue(), 1, Integer::sum);
+            }
+            nodes += filteredNodes.size();
+        }
+
         if (!incremental && (!batchNodes.isEmpty() || !batchEdges.isEmpty())) {
             String batchId = "batch:" + batchNumber + ":" + System.nanoTime();
             cache.storeBatchResults(batchId, "batch-" + batchNumber, "mixed", batchNodes, batchEdges);
@@ -1092,16 +1167,34 @@ Map<String, List<DiscoveredFile>> detectModules(Path root, List<DiscoveredFile>
     DetectorResult analyzeFileWithRegistry(DiscoveredFile file, Path repoPath,
                                            DetectorRegistry detectorRegistry,
                                            InfrastructureRegistry infraRegistry) {
+        return analyzeFileWithRegistry(file, repoPath, detectorRegistry, infraRegistry, null);
+    }
+
+    /**
+     * Analyze a single file using the given registries, optionally with pre-read content.
+     * When {@code preReadContent} is non-null, it is used directly instead of reading from disk,
+     * avoiding a duplicate file read (the content was already read during keyword filtering).
+     *
+     * @param preReadContent decoded file content from the keyword filter, or null to read from disk
+     */
+    DetectorResult analyzeFileWithRegistry(DiscoveredFile file, Path repoPath,
+                                           DetectorRegistry detectorRegistry,
+                                           InfrastructureRegistry infraRegistry,
+                                           String preReadContent) {
         Instant fileStart = Instant.now();
-        Path absPath = repoPath.resolve(file.path());
 
         String content;
-        try {
-            byte[] raw = Files.readAllBytes(absPath);
-            content = DetectorUtils.decodeContent(raw);
-        } catch (IOException e) {
-            log.debug("Could not read file: {}", absPath, e);
-            return DetectorResult.empty();
+        if (preReadContent != null) {
+            content = preReadContent;
+        } else {
+            Path absPath = repoPath.resolve(file.path());
+            try {
+                byte[] raw = Files.readAllBytes(absPath);
+                content = DetectorUtils.decodeContent(raw);
+            } catch (IOException e) {
+                log.debug("Could not read file: {}", absPath, e);
+                return DetectorResult.empty();
+            }
         }
 
         if (isMinified(file, content)) {
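The duplicate-read fix above has a sibling described in the commit message: LexicalEnricher now groups nodes by filePath so DocCommentExtractor reads each file once per group. That diff is not shown here; a minimal sketch of the grouping idea, with NodeRef and enrich as hypothetical stand-ins for the real types:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

record NodeRef(String filePath, int line) {}

class GroupedReadSketch {
    static void enrich(List<NodeRef> nodes) throws IOException {
        // Group nodes by file so each file is read once, not once per node:
        // 100 nodes in one file = 1 disk read instead of 100.
        Map<String, List<NodeRef>> byFile =
                nodes.stream().collect(Collectors.groupingBy(NodeRef::filePath));

        for (var entry : byFile.entrySet()) {
            List<String> lines = Files.readAllLines(Path.of(entry.getKey()));
            for (NodeRef n : entry.getValue()) {
                // ... extract the doc comment ending at lines.get(n.line() - 1) ...
            }
        }
    }
}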

src/main/java/io/github/randomcodespace/iq/cache/AnalysisCache.java

Lines changed: 28 additions & 5 deletions
@@ -41,7 +41,7 @@ public class AnalysisCache implements Closeable {
     private static final Logger log = LoggerFactory.getLogger(AnalysisCache.class);
 
     /** Bump when hash algorithm or schema changes to force cache invalidation. */
-    private static final int CACHE_VERSION = 2;
+    private static final int CACHE_VERSION = 3;
 
     private static final String SCHEMA_SQL = """
             CREATE TABLE IF NOT EXISTS cache_meta (
@@ -53,7 +53,9 @@ CREATE TABLE IF NOT EXISTS files (
                 content_hash VARCHAR PRIMARY KEY,
                 path VARCHAR NOT NULL,
                 language VARCHAR NOT NULL,
-                parsed_at VARCHAR NOT NULL
+                parsed_at VARCHAR NOT NULL,
+                status VARCHAR DEFAULT 'DETECTED',
+                detection_method VARCHAR DEFAULT 'antlr'
             );
 
             CREATE TABLE IF NOT EXISTS nodes (
@@ -230,22 +232,41 @@ public String getHashForPath(String filePath) {
     // --- Store results ---
 
     /**
-     * Persist analysis results for a single file.
+     * Persist analysis results for a single file with default status and detection method.
      */
     public void storeResults(String contentHash, String filePath, String language,
                              List<CodeNode> nodes, List<CodeEdge> edges) {
+        storeResults(contentHash, filePath, language, nodes, edges, "DETECTED", "antlr");
+    }
+
+    /**
+     * Persist analysis results for a single file with explicit status and detection method.
+     *
+     * @param contentHash     content hash key
+     * @param filePath        file path
+     * @param language        programming language
+     * @param nodes           detected nodes
+     * @param edges           detected edges
+     * @param status          file status (e.g. "DETECTED", "filtered")
+     * @param detectionMethod detection method used (e.g. "antlr", "regex_fallback", "none")
+     */
+    public void storeResults(String contentHash, String filePath, String language,
+                             List<CodeNode> nodes, List<CodeEdge> edges,
+                             String status, String detectionMethod) {
         rwLock.writeLock().lock();
         try {
             conn.setAutoCommit(false);
             String now = Instant.now().toString();
 
             // Upsert file record (H2 MySQL mode supports INSERT ... ON DUPLICATE KEY UPDATE)
             try (var stmt = conn.prepareStatement(
-                    "MERGE INTO files (content_hash, path, language, parsed_at) KEY (content_hash) VALUES (?, ?, ?, ?)")) {
+                    "MERGE INTO files (content_hash, path, language, parsed_at, status, detection_method) KEY (content_hash) VALUES (?, ?, ?, ?, ?, ?)")) {
                 stmt.setString(1, contentHash);
                 stmt.setString(2, filePath);
                 stmt.setString(3, language);
                 stmt.setString(4, now);
+                stmt.setString(5, status);
+                stmt.setString(6, detectionMethod);
                 stmt.execute();
             }
 
@@ -475,11 +496,13 @@ public void replaceAll(List<CodeNode> nodes, List<CodeEdge> edges) {
 
             // Insert synthetic file record
             try (var stmt = conn.prepareStatement(
-                    "MERGE INTO files (content_hash, path, language, parsed_at) KEY (content_hash) VALUES (?, ?, ?, ?)")) {
+                    "MERGE INTO files (content_hash, path, language, parsed_at, status, detection_method) KEY (content_hash) VALUES (?, ?, ?, ?, ?, ?)")) {
                 stmt.setString(1, syntheticHash);
                 stmt.setString(2, "__enriched__");
                 stmt.setString(3, "enriched");
                 stmt.setString(4, now);
+                stmt.setString(5, "ENRICHED");
+                stmt.setString(6, "enriched");
                 stmt.execute();
             }
 