|
@@ -12,6 +12,7 @@
 import io.github.randomcodespace.iq.detector.DetectorResult;
 import io.github.randomcodespace.iq.detector.DetectorUtils;
 import io.github.randomcodespace.iq.grammar.AntlrParserFactory;
+import io.github.randomcodespace.iq.model.CodeEdge;
 import io.github.randomcodespace.iq.model.CodeNode;
 import io.github.randomcodespace.iq.model.NodeKind;
 import org.slf4j.Logger;
@@ -347,6 +348,249 @@ private AnalysisResult runWithCache(Path root, Integer parallelism, AnalysisCache
         );
     }

+    /**
+     * Execute the indexing pipeline with batched streaming to H2.
+     * <p>
+     * Unlike {@link #run}, this method does NOT hold all nodes and edges in memory.
+     * It processes files in batches, flushes each batch to H2, and then releases
+     * that batch's memory. This path skips linkers, layer classification, and Neo4j.
+     *
+     * @param repoPath    root of the repository to analyze
+     * @param parallelism maximum number of parallel threads, or {@code null} for adaptive (virtual threads)
+     * @param batchSize   number of files per H2 flush batch
+     * @param incremental if true, use file content hashing to skip unchanged files
+     * @param onProgress  optional callback for progress reporting (may be null)
+     * @return the analysis result containing graph data and statistics
+     */
+    public AnalysisResult runBatchedIndex(Path repoPath, Integer parallelism, int batchSize,
+                                          boolean incremental, Consumer<String> onProgress) {
+        Instant start = Instant.now();
+        Consumer<String> report = onProgress != null ? onProgress : msg -> {};
+
+        final Path root = repoPath.toAbsolutePath().normalize();
+
+        // Always use the H2 cache as the primary store during indexing.
+        Path cachePath = root.resolve(config.getCacheDir()).resolve("analysis-cache.db");
+        AnalysisCache cache;
+        try {
+            cache = new AnalysisCache(cachePath);
+        } catch (Exception e) {
+            log.error("Failed to open H2 store at {}", cachePath, e);
+            return new AnalysisResult(0, 0, 0, 0,
+                    Map.of(), Map.of(), Map.of(), Map.of(), Duration.ZERO);
+        }
+
+        try {
+            return runBatchedWithCache(root, parallelism, batchSize, incremental, cache, report, start);
+        } finally {
+            cache.close();
+        }
+    }
+
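A minimal sketch of how a caller might drive this entry point; the `pipeline` receiver and the literal argument values are illustrative assumptions, not part of this change:

    // Hypothetical caller: index a repository in batches of 500 files,
    // with adaptive (virtual-thread) parallelism and incremental hashing on.
    Consumer<String> progress = msg -> System.out.println("[index] " + msg);
    AnalysisResult result = pipeline.runBatchedIndex(
            Path.of("/path/to/repo"), // repoPath: repository root
            null,                     // parallelism: null -> virtual threads
            500,                      // batchSize: files per H2 flush
            true,                     // incremental: skip unchanged files
            progress);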
+    private AnalysisResult runBatchedWithCache(Path root, Integer parallelism, int batchSize,
+                                               boolean incremental, AnalysisCache cache,
+                                               Consumer<String> report, Instant start) {
+        // 0. Load project config for pipeline filtering.
+        ProjectConfig projectConfig = ProjectConfigLoader.loadProjectConfig(root);
+        DetectorRegistry effectiveRegistry = registry;
+
+        if (projectConfig.hasDetectorCategoryFilter()) {
+            effectiveRegistry = effectiveRegistry.filterByCategories(
+                    projectConfig.getDetectorCategories());
+            report.accept("Detector categories: " + projectConfig.getDetectorCategories());
+        }
+        if (projectConfig.hasDetectorIncludeFilter()) {
+            effectiveRegistry = effectiveRegistry.filterByNames(
+                    projectConfig.getDetectorInclude());
+            report.accept("Detector include: " + projectConfig.getDetectorInclude());
+        }
+        if (parallelism == null && projectConfig.getPipelineParallelism() != null) {
+            parallelism = projectConfig.getPipelineParallelism();
+            report.accept("Pipeline parallelism: " + parallelism + " (from config)");
+        }
+
+        // 1. Discover files.
+        report.accept("Discovering files...");
+        List<DiscoveredFile> files = fileDiscovery.discover(root);
+
+        if (projectConfig.hasLanguageFilter()) {
+            Set<String> allowedLanguages = new HashSet<>(projectConfig.getLanguages());
+            files = files.stream()
+                    .filter(f -> allowedLanguages.contains(f.language()))
+                    .toList();
+            report.accept("Language filter active: " + projectConfig.getLanguages());
+        }
+        if (projectConfig.hasExcludePatterns()) {
+            List<String> excludes = projectConfig.getExclude();
+            files = files.stream()
+                    .filter(f -> !matchesAnyExclude(f.path().toString(), excludes))
+                    .toList();
+            report.accept("Exclude patterns: " + excludes);
+        }
+
+        int totalFiles = files.size();
+        report.accept("Found " + totalFiles + " files");
+
+        // Compute the per-language breakdown.
+        Map<String, Integer> languageBreakdown = new HashMap<>();
+        for (DiscoveredFile f : files) {
+            languageBreakdown.merge(f.language(), 1, Integer::sum);
+        }
+
+        // 2. Process files in batches.
+        report.accept("Indexing " + totalFiles + " files in batches of " + batchSize + "...");
+
+        final DetectorRegistry detectorRegistry = effectiveRegistry;
+        int totalNodesWritten = 0;
+        int totalEdgesWritten = 0;
+        int filesAnalyzed = 0;
+        int cacheHits = 0;
+        int batchNumber = 0;
+        Map<String, Integer> nodeBreakdown = new HashMap<>();
+        Map<String, Integer> edgeBreakdown = new HashMap<>();
+        Map<String, Integer> frameworkBreakdown = new HashMap<>();
+
+        // Clear previous index data if not incremental.
+        if (!incremental) {
+            cache.clear();
+        }
+
+        List<DiscoveredFile> batch = new ArrayList<>(batchSize);
+        for (int fileIdx = 0; fileIdx < files.size(); fileIdx++) {
+            batch.add(files.get(fileIdx));
+
+            if (batch.size() >= batchSize || fileIdx == files.size() - 1) {
+                batchNumber++;
+                report.accept("Processing batch " + batchNumber + " (" + batch.size() + " files)...");
+
+                // Analyze the batch in parallel.
+                DetectorResult[] resultSlots = new DetectorResult[batch.size()];
+                // Mutable box so worker lambdas can count cache hits.
+                int[] batchCacheHits = {0};
+
+                var executorService = parallelism != null && parallelism > 0
+                        ? Executors.newFixedThreadPool(parallelism)
+                        : Executors.newVirtualThreadPerTaskExecutor();
+                try (var executor = executorService) {
+                    List<Future<?>> futures = new ArrayList<>(batch.size());
+                    for (int i = 0; i < batch.size(); i++) {
+                        final int idx = i;
+                        final DiscoveredFile file = batch.get(idx);
+                        futures.add(executor.submit(() -> {
+                            if (incremental) {
+                                try {
+                                    Path absPath = root.resolve(file.path());
+                                    String hash = FileHasher.hash(absPath);
+                                    if (cache.isCached(hash)) {
+                                        var cached = cache.loadCachedResults(hash);
+                                        if (cached != null) {
+                                            resultSlots[idx] = DetectorResult.of(cached.nodes(), cached.edges());
+                                            synchronized (batchCacheHits) {
+                                                batchCacheHits[0]++;
+                                            }
+                                            return null;
+                                        }
+                                    }
+                                    DetectorResult result = analyzeFile(file, root, detectorRegistry);
+                                    resultSlots[idx] = result;
+                                    if (result != null && (!result.nodes().isEmpty() || !result.edges().isEmpty())) {
+                                        cache.storeResults(hash, file.path().toString(), file.language(),
+                                                result.nodes(), result.edges());
+                                    }
+                                } catch (IOException e) {
+                                    // Hashing failed; fall back to a plain (uncached) analysis.
+                                    log.debug("Could not hash file {}", file.path(), e);
+                                    resultSlots[idx] = analyzeFile(file, root, detectorRegistry);
+                                }
+                            } else {
+                                resultSlots[idx] = analyzeFile(file, root, detectorRegistry);
+                            }
+                            return null;
+                        }));
+                    }
+
+                    // Wait for completion in submission order; results land in resultSlots.
+                    for (int i = 0; i < futures.size(); i++) {
+                        try {
+                            futures.get(i).get();
+                        } catch (ExecutionException e) {
+                            log.warn("Analysis failed for {}", batch.get(i).path(), e.getCause());
+                        } catch (InterruptedException e) {
+                            Thread.currentThread().interrupt();
+                            log.warn("Analysis interrupted for {}", batch.get(i).path());
+                        }
+                    }
+                }
+
+                cacheHits += batchCacheHits[0];
+
+                // Collect batch results; cache misses are flushed to H2 below.
+                List<CodeNode> batchNodes = new ArrayList<>();
+                List<CodeEdge> batchEdges = new ArrayList<>();
+                int batchFilesAnalyzed = 0;
+
+                for (int i = 0; i < resultSlots.length; i++) {
+                    DetectorResult result = resultSlots[i];
+                    if (result != null && (!result.nodes().isEmpty() || !result.edges().isEmpty())) {
+                        batchFilesAnalyzed++;
+                        // Accumulate for the batch flush only in non-incremental mode;
+                        // incremental results were already stored per file above.
+                        if (!incremental) {
+                            batchNodes.addAll(result.nodes());
+                            batchEdges.addAll(result.edges());
+                        }
+                        // Track breakdowns.
+                        for (CodeNode node : result.nodes()) {
+                            nodeBreakdown.merge(node.getKind().getValue(), 1, Integer::sum);
+                            Object fw = node.getProperties().get("framework");
+                            if (fw != null && !fw.toString().isEmpty()) {
+                                frameworkBreakdown.merge(fw.toString(), 1, Integer::sum);
+                            }
+                        }
+                        for (var edge : result.edges()) {
+                            edgeBreakdown.merge(edge.getKind().getValue(), 1, Integer::sum);
+                        }
+                        totalNodesWritten += result.nodes().size();
+                        totalEdgesWritten += result.edges().size();
+                    }
+                }
+
+                filesAnalyzed += batchFilesAnalyzed;
+
+                // In non-incremental mode, flush the whole batch to H2 in one shot.
+                if (!incremental && (!batchNodes.isEmpty() || !batchEdges.isEmpty())) {
+                    String batchId = "batch:" + batchNumber + ":" + System.nanoTime();
+                    cache.storeBatchResults(batchId, "batch-" + batchNumber,
+                            "mixed", batchNodes, batchEdges);
+                }
+
+                // Release batch memory before the next round.
+                batch.clear();
+            }
+        }
+
+        if (cacheHits > 0) {
+            report.accept("Cache hits: " + cacheHits + " / " + totalFiles + " files");
+        }
+
+        // Record run.
+        String commitSha = getGitHead(root);
+        cache.recordRun(commitSha, filesAnalyzed);
+
+        Duration elapsed = Duration.between(start, Instant.now());
+        report.accept("Index complete - " + totalNodesWritten + " nodes, "
+                + totalEdgesWritten + " edges written to H2");
+
+        return new AnalysisResult(
+                totalFiles,
+                filesAnalyzed,
+                totalNodesWritten,
+                totalEdgesWritten,
+                languageBreakdown,
+                nodeBreakdown,
+                edgeBreakdown,
+                frameworkBreakdown,
+                elapsed
+        );
+    }
+
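The incremental path above relies on `FileHasher.hash`, which is not part of this diff. A plausible minimal sketch, assuming the contract is a hex-encoded SHA-256 of the file's bytes:

    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.security.MessageDigest;
    import java.security.NoSuchAlgorithmException;
    import java.util.HexFormat;

    final class FileHasher {
        private FileHasher() {}

        /** Hex-encoded SHA-256 of the file contents (assumed contract, not shown in the diff). */
        static String hash(Path file) throws IOException {
            try {
                MessageDigest digest = MessageDigest.getInstance("SHA-256");
                return HexFormat.of().formatHex(digest.digest(Files.readAllBytes(file)));
            } catch (NoSuchAlgorithmException e) {
                throw new IllegalStateException("SHA-256 unavailable", e); // guaranteed by the JDK
            }
        }
    }

Reading the whole file into memory keeps the sketch short; a streaming DigestInputStream would be the safer choice for very large files.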
     /**
      * Check whether a file is minified (e.g. *.min.js, *.bundle.js) and large
      * enough that running detectors would be wasteful.
|
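`matchesAnyExclude` is referenced in the exclude filter but likewise not shown here. One hedged reading, assuming the exclude patterns are globs evaluated against the file's relative path:

    // Assumed semantics: a file is excluded if any configured glob matches its path.
    // The real helper may use a different pattern syntax (e.g. regex or prefix match).
    private static boolean matchesAnyExclude(String path, List<String> excludes) {
        Path candidate = Path.of(path);
        return excludes.stream().anyMatch(pattern -> FileSystems.getDefault()
                .getPathMatcher("glob:" + pattern).matches(candidate));
    }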