Skip to content

Commit f96d86a

Browse files
aksOpsclaude
andcommitted
feat: file type classification, test file detection, and searchable snippets
FileClassifier:
- Classifies every file as SOURCE|CONFIG|TEST|GENERATED|MINIFIED|TEXT|BINARY
- Test detection: test/tests/spec/__tests__ dirs + language-specific patterns (*Test.java, *_test.go, test_*.py, *.spec.ts, etc.)
- Generated: .d.ts, .map, .lock, vendor/, generated/ dirs
- Binary: images, fonts, compiled assets, archives

Analyzer integration:
- TEST, BINARY, GENERATED, TEXT files get inventory-only nodes (no detectors) tagged with file_type property — reduces graph noise, speeds up indexing
- SOURCE and CONFIG files get full detection as before
- All nodes tagged with file_type for filtering in queries/UI

H2 searchable snippets:
- files table: added file_type and snippet columns (cache version 4)
- Snippet stores first 200 lines (max 10KB) of text files
- searchSnippets(query) method enables text search across all files: "find all files referencing DATABASE_URL" even in non-indexed files

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 4304caf commit f96d86a

3 files changed

Lines changed: 330 additions & 19 deletions

File tree

src/main/java/io/github/randomcodespace/iq/analyzer/Analyzer.java

Lines changed: 127 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -278,8 +278,11 @@ private AnalysisResult runWithCache(Path root, Integer parallelism, AnalysisCach
278278
DetectorResult result = analyzeFile(file, root, detectorRegistry);
279279
resultSlots[idx] = result;
280280
if (result != null && (!result.nodes().isEmpty() || !result.edges().isEmpty())) {
281+
FileClassifier.FileType ft = FileClassifier.classify(file.path(), file.language());
282+
String snippet = computeSnippetFromFile(root.resolve(file.path()), ft);
281283
cacheRef.storeResults(hash, file.path().toString(), file.language(),
282-
result.nodes(), result.edges());
284+
result.nodes(), result.edges(), "DETECTED", "antlr",
285+
ft.name().toLowerCase(), snippet);
283286
}
284287
} catch (IOException e) {
285288
log.debug("Could not hash file {}", file.path(), e);
@@ -583,8 +586,11 @@ private AnalysisResult runBatchedWithCache(Path root, Integer parallelism, int b
583586
DetectorResult result = analyzeFile(file, root, detectorRegistry);
584587
resultSlots[idx] = result;
585588
if (result != null && (!result.nodes().isEmpty() || !result.edges().isEmpty())) {
589+
FileClassifier.FileType ft = FileClassifier.classify(file.path(), file.language());
590+
String snippet = computeSnippetFromFile(root.resolve(file.path()), ft);
586591
cache.storeResults(hash, file.path().toString(), file.language(),
587-
result.nodes(), result.edges());
592+
result.nodes(), result.edges(), "DETECTED", "antlr",
593+
ft.name().toLowerCase(), snippet);
588594
}
589595
} catch (IOException e) {
590596
log.debug("Could not hash file {}", file.path(), e);
@@ -1002,8 +1008,13 @@ private int[] processSmartBatch(
10021008
DetectorResult result = analyzeFileWithRegistry(file, root, detectorRegistry, infraRegistry, cachedContent);
10031009
slots[idx] = result;
10041010
if (result != null && (!result.nodes().isEmpty() || !result.edges().isEmpty())) {
1011+
FileClassifier.FileType ft = FileClassifier.classify(file.path(), file.language());
1012+
String snippet = cachedContent != null
1013+
? computeSnippet(cachedContent, ft)
1014+
: computeSnippetFromFile(root.resolve(file.path()), ft);
10051015
cache.storeResults(hash, file.path().toString(), file.language(),
1006-
result.nodes(), result.edges());
1016+
result.nodes(), result.edges(), "DETECTED", "antlr",
1017+
ft.name().toLowerCase(), snippet);
10071018
}
10081019
} catch (IOException e) {
10091020
log.debug("Could not hash {}", file.path(), e);
@@ -1183,6 +1194,19 @@ DetectorResult analyzeFileWithRegistry(DiscoveredFile file, Path repoPath,
11831194
String preReadContent) {
11841195
Instant fileStart = Instant.now();
11851196

1197+
// Classify file type before reading content
1198+
FileClassifier.FileType fileType = FileClassifier.classify(file.path(), file.language());
1199+
1200+
// Binary files: inventory-only node, no content read needed
1201+
if (fileType == FileClassifier.FileType.BINARY) {
1202+
return createInventoryNode(file, "binary");
1203+
}
1204+
1205+
// Generated files: inventory-only node, skip detectors
1206+
if (fileType == FileClassifier.FileType.GENERATED) {
1207+
return createInventoryNode(file, "generated");
1208+
}
1209+
11861210
String content;
11871211
if (preReadContent != null) {
11881212
content = preReadContent;
@@ -1197,6 +1221,16 @@ DetectorResult analyzeFileWithRegistry(DiscoveredFile file, Path repoPath,
11971221
}
11981222
}
11991223

1224+
// Test files: inventory-only node with file_type=test
1225+
if (fileType == FileClassifier.FileType.TEST) {
1226+
return createInventoryNode(file, "test");
1227+
}
1228+
1229+
// Text files (unknown language): inventory-only node
1230+
if (fileType == FileClassifier.FileType.TEXT) {
1231+
return createInventoryNode(file, "text");
1232+
}
1233+
12001234
if (isMinified(file, content)) {
12011235
log.debug("Skipping detectors for minified file: {}", file.path());
12021236
String moduleName = DetectorUtils.deriveModuleName(file.path().toString(), file.language());
@@ -1206,10 +1240,13 @@ DetectorResult analyzeFileWithRegistry(DiscoveredFile file, Path repoPath,
12061240
file.path().getFileName().toString());
12071241
node.setFilePath(file.path().toString());
12081242
node.setModule(moduleName);
1209-
node.setProperties(new java.util.LinkedHashMap<>(Map.of("minified", true)));
1243+
node.setProperties(new java.util.LinkedHashMap<>(Map.of("minified", true, "file_type", "minified")));
12101244
return DetectorResult.of(List.of(node), List.of());
12111245
}
12121246

1247+
// SOURCE and CONFIG files: run detectors
1248+
String fileTypeStr = (fileType == FileClassifier.FileType.CONFIG) ? "config" : "source";
1249+
12131250
Object parsedData = null;
12141251
if (STRUCTURED_LANGUAGES.contains(file.language())) {
12151252
parsedData = parser.parse(file.language(), content, file.path().toString());
@@ -1258,12 +1295,12 @@ DetectorResult analyzeFileWithRegistry(DiscoveredFile file, Path repoPath,
12581295
log.info("🐢 SLOW: {} took {}ms", file.path(), fileMs);
12591296
}
12601297

1261-
if (moduleName != null) {
1262-
for (CodeNode node : allNodes) {
1263-
if (node.getModule() == null || node.getModule().isEmpty()) {
1264-
node.setModule(moduleName);
1265-
}
1298+
// Set module and file_type on all nodes
1299+
for (CodeNode node : allNodes) {
1300+
if (moduleName != null && (node.getModule() == null || node.getModule().isEmpty())) {
1301+
node.setModule(moduleName);
12661302
}
1303+
node.getProperties().put("file_type", fileTypeStr);
12671304
}
12681305

12691306
return DetectorResult.of(allNodes, allEdges);
@@ -1359,6 +1396,19 @@ DetectorResult analyzeFile(DiscoveredFile file, Path repoPath, DetectorRegistry
13591396
Instant fileStart = Instant.now();
13601397
Path absPath = repoPath.resolve(file.path());
13611398

1399+
// Classify file type before reading content
1400+
FileClassifier.FileType fileType = FileClassifier.classify(file.path(), file.language());
1401+
1402+
// Binary files: inventory-only node, no content read needed
1403+
if (fileType == FileClassifier.FileType.BINARY) {
1404+
return createInventoryNode(file, "binary");
1405+
}
1406+
1407+
// Generated files: inventory-only node, skip detectors
1408+
if (fileType == FileClassifier.FileType.GENERATED) {
1409+
return createInventoryNode(file, "generated");
1410+
}
1411+
13621412
// Read file content
13631413
String content;
13641414
try {
@@ -1369,6 +1419,16 @@ DetectorResult analyzeFile(DiscoveredFile file, Path repoPath, DetectorRegistry
13691419
return DetectorResult.empty();
13701420
}
13711421

1422+
// Test files: inventory-only node with file_type=test
1423+
if (fileType == FileClassifier.FileType.TEST) {
1424+
return createInventoryNode(file, "test");
1425+
}
1426+
1427+
// Text files (unknown language): inventory-only node
1428+
if (fileType == FileClassifier.FileType.TEXT) {
1429+
return createInventoryNode(file, "text");
1430+
}
1431+
13721432
// Minified file detection: create a node with minified=true but skip detectors
13731433
if (isMinified(file, content)) {
13741434
log.debug("Skipping detectors for minified file: {}", file.path());
@@ -1379,10 +1439,13 @@ DetectorResult analyzeFile(DiscoveredFile file, Path repoPath, DetectorRegistry
13791439
file.path().getFileName().toString());
13801440
node.setFilePath(file.path().toString());
13811441
node.setModule(moduleName);
1382-
node.setProperties(new java.util.LinkedHashMap<>(Map.of("minified", true)));
1442+
node.setProperties(new java.util.LinkedHashMap<>(Map.of("minified", true, "file_type", "minified")));
13831443
return DetectorResult.of(List.of(node), List.of());
13841444
}
13851445

1446+
// SOURCE and CONFIG files: run detectors
1447+
String fileTypeStr = (fileType == FileClassifier.FileType.CONFIG) ? "config" : "source";
1448+
13861449
// Parse structured data if applicable
13871450
Object parsedData = null;
13881451
if (STRUCTURED_LANGUAGES.contains(file.language())) {
@@ -1440,18 +1503,34 @@ DetectorResult analyzeFile(DiscoveredFile file, Path repoPath, DetectorRegistry
14401503
log.info("🐢 SLOW: {} took {}ms", file.path(), fileMs);
14411504
}
14421505

1443-
// Set module on all nodes that don't have one yet
1444-
if (moduleName != null) {
1445-
for (CodeNode node : allNodes) {
1446-
if (node.getModule() == null || node.getModule().isEmpty()) {
1447-
node.setModule(moduleName);
1448-
}
1506+
// Set module and file_type on all nodes
1507+
for (CodeNode node : allNodes) {
1508+
if (moduleName != null && (node.getModule() == null || node.getModule().isEmpty())) {
1509+
node.setModule(moduleName);
14491510
}
1511+
node.getProperties().put("file_type", fileTypeStr);
14501512
}
14511513

14521514
return DetectorResult.of(allNodes, allEdges);
14531515
}
14541516

1517+
/**
1518+
* Create an inventory-only node for files that should not have detectors run.
1519+
*/
1520+
private static DetectorResult createInventoryNode(DiscoveredFile file, String fileType) {
1521+
String moduleName = DetectorUtils.deriveModuleName(file.path().toString(), file.language());
1522+
CodeNode node = new CodeNode(
1523+
"file:" + file.path() + ":module:" + (moduleName != null ? moduleName : file.path().getFileName().toString()),
1524+
NodeKind.MODULE,
1525+
file.path().getFileName().toString());
1526+
node.setFilePath(file.path().toString());
1527+
node.setModule(moduleName);
1528+
node.setProperties(new java.util.LinkedHashMap<>(Map.of(
1529+
"file_type", fileType,
1530+
"language", file.language() != null ? file.language() : "")));
1531+
return DetectorResult.of(List.of(node), List.of());
1532+
}
1533+
14551534
/**
14561535
* Regex-only analysis fallback for files where ANTLR timed out.
14571536
* Ensures zero data loss — every file produces nodes via regex detection.
@@ -1544,6 +1623,38 @@ private String getGitHead(Path repoPath) {
15441623
return null;
15451624
}
15461625

1626+
/**
1627+
* Read file content and compute snippet. Returns null on error or for binary files.
1628+
*/
1629+
private static String computeSnippetFromFile(Path absPath, FileClassifier.FileType fileType) {
1630+
if (fileType == FileClassifier.FileType.BINARY) return null;
1631+
try {
1632+
byte[] raw = Files.readAllBytes(absPath);
1633+
String content = DetectorUtils.decodeContent(raw);
1634+
return computeSnippet(content, fileType);
1635+
} catch (IOException e) {
1636+
return null;
1637+
}
1638+
}
1639+
1640+
/**
1641+
* Compute a snippet from file content for storage in H2.
1642+
* Returns the first 200 lines, capped at 10KB, or null for binary files.
1643+
*/
1644+
static String computeSnippet(String content, FileClassifier.FileType fileType) {
1645+
if (fileType == FileClassifier.FileType.BINARY) return null;
1646+
if (content == null || content.isEmpty()) return null;
1647+
// First 200 lines, max 10KB
1648+
String[] lines = content.split("\n", 201);
1649+
StringBuilder sb = new StringBuilder();
1650+
for (int i = 0; i < Math.min(lines.length, 200); i++) {
1651+
if (sb.length() + lines[i].length() > 10_000) break;
1652+
if (i > 0) sb.append('\n');
1653+
sb.append(lines[i]);
1654+
}
1655+
return sb.toString();
1656+
}
1657+
15471658
/**
15481659
* Pre-compile exclude glob patterns into regex Pattern objects.
15491660
*/
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
package io.github.randomcodespace.iq.analyzer;
2+
3+
import java.nio.file.Path;
import java.util.Locale;
import java.util.Set;
5+
6+
/**
 * Classifies files into categories that determine how they are processed
 * in the analysis pipeline.
 * <p>
 * {@link #classify} checks, in order: binary extension, generated markers,
 * test markers, config language, known language. The first match wins and
 * anything left over is {@link FileType#TEXT}.
 * <p>
 * {@link FileType#MINIFIED} is never returned by {@link #classify}; it exists
 * for callers that detect minified content separately.
 */
public final class FileClassifier {
    private FileClassifier() {}

    /** Processing category assigned to a file. */
    public enum FileType {
        SOURCE,     // File with a known, non-config language
        CONFIG,     // YAML, JSON, TOML, INI, properties, Dockerfile, etc.
        TEST,       // Test directories or test file-name patterns
        GENERATED,  // .d.ts, .map, .lock, .generated.*, vendor/, generated/
        MINIFIED,   // Not produced by classify(); set by a separate content check
        TEXT,       // No known language and no other match
        BINARY      // Images, fonts, compiled assets, archives
    }

    // Directory names that mark everything beneath them as test code.
    private static final Set<String> TEST_DIRS = Set.of(
            "test", "tests", "spec", "specs", "__tests__", "__mocks__",
            "testing", "testdata", "fixtures", "test-resources", "testFixtures"
    );

    // Directory names that mark everything beneath them as generated/vendored.
    private static final Set<String> GENERATED_DIRS = Set.of(
            "generated", "gen", "vendor", "third_party", "thirdparty"
    );

    // Lower-case file extensions treated as binary.
    private static final Set<String> BINARY_EXTENSIONS = Set.of(
            "png", "jpg", "jpeg", "gif", "bmp", "ico", "svg", "webp",
            "woff", "woff2", "ttf", "eot", "otf",
            "pdf", "zip", "gz", "tar", "jar", "war", "ear",
            "class", "pyc", "pyo", "so", "dll", "exe", "dylib",
            "mp3", "mp4", "wav", "avi", "mov",
            "sqlite", "db", "mdb"
    );

    /**
     * Classify a file based on its path, filename, and language.
     *
     * @param relativePath path relative to repository root
     * @param language     language identifier from FileDiscovery (may be null)
     * @return the file type classification
     */
    public static FileType classify(Path relativePath, String language) {
        String pathStr = relativePath.toString().replace('\\', '/');
        String fileName = relativePath.getFileName().toString();
        String ext = getExtension(fileName);

        // Locale.ROOT: default-locale lowercasing maps "GIF" to "gıf" under a
        // Turkish locale, which would miss the extension table.
        if (BINARY_EXTENSIONS.contains(ext.toLowerCase(Locale.ROOT))) {
            return FileType.BINARY;
        }
        if (isGenerated(pathStr, fileName, ext)) {
            return FileType.GENERATED;
        }
        if (isTestFile(pathStr, fileName, language)) {
            return FileType.TEST;
        }
        if (isConfigLanguage(language)) {
            return FileType.CONFIG;
        }
        if (language != null && !language.isEmpty()) {
            return FileType.SOURCE;
        }
        return FileType.TEXT;
    }

    /**
     * Decide whether a file is a test file.
     * <p>
     * A file is a test when any of its parent directories is a known test
     * directory, or when its name follows a test naming convention.
     *
     * @param pathStr  '/'-separated path of the file
     * @param fileName last component of the path
     * @param language language identifier; currently not consulted
     * @return {@code true} if the file is considered a test file
     */
    static boolean isTestFile(String pathStr, String fileName, String language) {
        if (hasDirectoryIn(directoryPart(pathStr, fileName), TEST_DIRS)) {
            return true;
        }
        if (isJvmTestName(fileName)) {
            return true;
        }

        String lower = fileName.toLowerCase(Locale.ROOT);
        if (lower.endsWith("_test.go")) {
            return true;
        }
        if (lower.startsWith("test_") && lower.endsWith(".py")) {
            return true;
        }
        if (lower.endsWith("_test.py")) {
            return true;
        }
        if (lower.endsWith(".test.ts") || lower.endsWith(".spec.ts")
                || lower.endsWith(".test.js") || lower.endsWith(".spec.js")
                || lower.endsWith(".test.tsx") || lower.endsWith(".spec.tsx")
                || lower.endsWith(".test.jsx") || lower.endsWith(".spec.jsx")) {
            return true;
        }
        return lower.endsWith("_test.rs");
    }

    /**
     * Decide whether a file is generated or vendored.
     *
     * @param pathStr  '/'-separated path of the file
     * @param fileName last component of the path
     * @param ext      extension of {@code fileName} without the dot
     * @return {@code true} if the file is considered generated
     */
    static boolean isGenerated(String pathStr, String fileName, String ext) {
        if (hasDirectoryIn(directoryPart(pathStr, fileName), GENERATED_DIRS)) {
            return true;
        }
        if (fileName.endsWith(".d.ts") || fileName.endsWith(".js.map")
                || fileName.endsWith(".css.map")) {
            return true;
        }
        if (fileName.endsWith(".generated.java") || fileName.endsWith(".generated.ts")
                || fileName.contains("_generated")) {
            return true;
        }
        return "lock".equals(ext) || fileName.equals("package-lock.json")
                || fileName.equals("yarn.lock") || fileName.equals("pnpm-lock.yaml")
                || fileName.equals("Cargo.lock") || fileName.equals("poetry.lock")
                || fileName.equals("Gemfile.lock") || fileName.equals("go.sum");
    }

    /**
     * @param language language identifier (may be null)
     * @return {@code true} if the language is one of the configuration/schema languages
     */
    static boolean isConfigLanguage(String language) {
        if (language == null) {
            return false;
        }
        return switch (language) {
            case "yaml", "json", "toml", "ini", "properties", "xml",
                 "dockerfile", "hcl", "bicep", "proto", "graphql" -> true;
            default -> false;
        };
    }

    /**
     * @param fileName a file name
     * @return the text after the last '.', or an empty string if there is no '.'
     */
    static String getExtension(String fileName) {
        int dot = fileName.lastIndexOf('.');
        return dot >= 0 ? fileName.substring(dot + 1) : "";
    }

    /**
     * JVM test naming conventions, matched case-sensitively on the CamelCase
     * suffix so that ordinary words such as {@code Audit}, {@code Commit} or
     * {@code Latest} are not mistaken for {@code *IT} / {@code *Test} classes.
     */
    private static boolean isJvmTestName(String fileName) {
        int dot = fileName.lastIndexOf('.');
        if (dot <= 0) {
            return false;
        }
        String stem = fileName.substring(0, dot);
        String ext = fileName.substring(dot + 1).toLowerCase(Locale.ROOT);
        return switch (ext) {
            case "java" -> stem.endsWith("Test") || stem.endsWith("Tests")
                    || stem.endsWith("Spec") || stem.endsWith("IT");
            case "kt" -> stem.endsWith("Test");
            case "scala" -> stem.endsWith("Test") || stem.endsWith("Spec");
            default -> false;
        };
    }

    /**
     * Strip the trailing file name from a path so that only directory
     * components remain. If the path does not end with the file name it is
     * returned unchanged.
     */
    private static String directoryPart(String pathStr, String fileName) {
        if (pathStr.equals(fileName)) {
            return "";
        }
        String suffix = "/" + fileName;
        if (pathStr.endsWith(suffix)) {
            return pathStr.substring(0, pathStr.length() - suffix.length());
        }
        return pathStr;
    }

    /** Whether any '/'-separated component of {@code dirPath} is in {@code dirNames}. */
    private static boolean hasDirectoryIn(String dirPath, Set<String> dirNames) {
        for (String part : dirPath.split("/")) {
            if (dirNames.contains(part)) {
                return true;
            }
        }
        return false;
    }
}

0 commit comments

Comments
 (0)