@@ -62,94 +62,128 @@ public void deleteById(String id) {
6262 }
6363
6464 /**
65- * Bulk save nodes and edges using Cypher MERGE (bypasses SDN to avoid
66- * duplicate key issues with relationship hydration) .
67- * Nodes are saved first, then edges, to ensure referential integrity .
65+ * Bulk save nodes and edges using UNWIND for batch Cypher inserts.
66+ * Creates an index on CodeNode.id for fast MATCH during edge creation .
67+ * Logs progress every 10K items for visibility on large graphs .
6868 */
6969 public void bulkSave (List <CodeNode > nodes ) {
7070 if (nodes .isEmpty ()) return ;
71+ long start = System .currentTimeMillis ();
7172
72- // Clear existing data
73+ // 1. Clear existing data
74+ log .info ("Neo4j: clearing existing graph..." );
7375 try (Transaction tx = graphDb .beginTx ()) {
76+ // Batch delete to avoid OOM on huge graphs
7477 tx .execute ("MATCH (n) DETACH DELETE n" );
7578 tx .commit ();
7679 }
7780
78- // Save nodes in batches of 500
79- int batchSize = 500 ;
80- for (int i = 0 ; i < nodes .size (); i += batchSize ) {
81- List <CodeNode > batch = nodes .subList (i , Math .min (i + batchSize , nodes .size ()));
81+ // 2. Create index on id property for fast MATCH during edge creation
82+ try (Transaction tx = graphDb .beginTx ()) {
83+ tx .execute ("CREATE INDEX IF NOT EXISTS FOR (n:CodeNode) ON (n.id)" );
84+ tx .commit ();
85+ }
86+
87+ // 3. Save nodes using UNWIND for batch inserts
88+ int batchSize = 2000 ;
89+ int totalNodes = nodes .size ();
90+ log .info ("Neo4j: persisting {} nodes..." , totalNodes );
91+
92+ for (int i = 0 ; i < totalNodes ; i += batchSize ) {
93+ List <CodeNode > batch = nodes .subList (i , Math .min (i + batchSize , totalNodes ));
94+ List <Map <String , Object >> batchProps = new ArrayList <>(batch .size ());
95+ for (CodeNode node : batch ) {
96+ batchProps .add (nodeToProps (node ));
97+ }
8298 try (Transaction tx = graphDb .beginTx ()) {
83- for (CodeNode node : batch ) {
84- Map <String , Object > props = new HashMap <>();
85- props .put ("id" , node .getId ());
86- props .put ("kind" , node .getKind ().getValue ());
87- props .put ("label" , node .getLabel ());
88- if (node .getFqn () != null ) props .put ("fqn" , node .getFqn ());
89- if (node .getModule () != null ) props .put ("module" , node .getModule ());
90- if (node .getFilePath () != null ) props .put ("filePath" , node .getFilePath ());
91- if (node .getLineStart () != null ) props .put ("lineStart" , node .getLineStart ());
92- if (node .getLineEnd () != null ) props .put ("lineEnd" , node .getLineEnd ());
93- if (node .getLayer () != null ) props .put ("layer" , node .getLayer ());
94- if (node .getAnnotations () != null && !node .getAnnotations ().isEmpty ()) {
95- props .put ("annotations" , String .join ("," , node .getAnnotations ()));
96- }
97- // Serialize properties map as individual prefixed keys
98- if (node .getProperties () != null ) {
99- for (var entry : node .getProperties ().entrySet ()) {
100- if (entry .getValue () != null ) {
101- props .put ("prop_" + entry .getKey (), entry .getValue ().toString ());
102- }
103- }
104- }
105- tx .execute ("CREATE (n:CodeNode) SET n = $props" , Map .of ("props" , props ));
106- }
99+ tx .execute ("UNWIND $batch AS props CREATE (n:CodeNode) SET n = props" ,
100+ Map .of ("batch" , batchProps ));
107101 tx .commit ();
108102 }
103+ int done = Math .min (i + batchSize , totalNodes );
104+ if (done % 10000 < batchSize || done == totalNodes ) {
105+ log .info (" nodes: {}/{} ({}%)" , done , totalNodes , 100 * done / totalNodes );
106+ }
109107 }
110108
111- // Build set of all saved node IDs for edge validation
112- Set <String > savedNodeIds = new HashSet <>(nodes . size () );
109+ // 4. Build set of all saved node IDs for edge validation
110+ Set <String > savedNodeIds = new HashSet <>(totalNodes );
113111 for (CodeNode node : nodes ) {
114112 savedNodeIds .add (node .getId ());
115113 }
116114
117- // Save edges (only where both source and target exist in the graph)
115+ // 5. Save edges using UNWIND for batch inserts
118116 List <CodeEdge > allEdges = nodes .stream ()
119117 .flatMap (n -> n .getEdges ().stream ())
120118 .toList ();
119+ int totalEdges = allEdges .size ();
120+ log .info ("Neo4j: persisting {} edges..." , totalEdges );
121+
121122 int created = 0 ;
122123 int skipped = 0 ;
123- for (int i = 0 ; i < allEdges .size (); i += batchSize ) {
124- List <CodeEdge > batch = allEdges .subList (i , Math .min (i + batchSize , allEdges .size ()));
125- try (Transaction tx = graphDb .beginTx ()) {
126- for (CodeEdge edge : batch ) {
127- String sourceId = edge .getSourceId ();
128- String targetId = edge .getTarget () != null ? edge .getTarget ().getId () : null ;
129- if (targetId == null || sourceId == null ) {
130- skipped ++;
131- continue ;
132- }
133- if (!savedNodeIds .contains (sourceId ) || !savedNodeIds .contains (targetId )) {
134- skipped ++;
135- continue ;
136- }
137- Map <String , Object > params = new HashMap <>();
138- params .put ("sourceId" , sourceId );
139- params .put ("targetId" , targetId );
140- params .put ("edgeId" , edge .getId ());
141- params .put ("kind" , edge .getKind ().getValue ());
124+ for (int i = 0 ; i < totalEdges ; i += batchSize ) {
125+ List <CodeEdge > batch = allEdges .subList (i , Math .min (i + batchSize , totalEdges ));
126+ List <Map <String , Object >> edgeBatch = new ArrayList <>(batch .size ());
127+ for (CodeEdge edge : batch ) {
128+ String sourceId = edge .getSourceId ();
129+ String targetId = edge .getTarget () != null ? edge .getTarget ().getId () : null ;
130+ if (targetId == null || sourceId == null
131+ || !savedNodeIds .contains (sourceId ) || !savedNodeIds .contains (targetId )) {
132+ skipped ++;
133+ continue ;
134+ }
135+ edgeBatch .add (Map .of (
136+ "sourceId" , sourceId ,
137+ "targetId" , targetId ,
138+ "edgeId" , edge .getId (),
139+ "kind" , edge .getKind ().getValue ()
140+ ));
141+ created ++;
142+ }
143+ if (!edgeBatch .isEmpty ()) {
144+ try (Transaction tx = graphDb .beginTx ()) {
142145 tx .execute ("""
143- MATCH (s:CodeNode {id: $sourceId}), (t:CodeNode {id: $targetId})
144- CREATE (s)-[:RELATES_TO {id: $edgeId, kind: $kind, sourceId: $sourceId}]->(t)
145- """ , params );
146- created ++;
146+ UNWIND $batch AS e
147+ MATCH (s:CodeNode {id: e.sourceId}), (t:CodeNode {id: e.targetId})
148+ CREATE (s)-[:RELATES_TO {id: e.edgeId, kind: e.kind, sourceId: e.sourceId}]->(t)
149+ """ , Map .of ("batch" , edgeBatch ));
150+ tx .commit ();
151+ }
152+ }
153+ int done = Math .min (i + batchSize , totalEdges );
154+ if (done % 10000 < batchSize || done == totalEdges ) {
155+ log .info (" edges: {}/{} ({}%)" , done , totalEdges , 100 * done / totalEdges );
156+ }
157+ }
158+
159+ long elapsed = System .currentTimeMillis () - start ;
160+ log .info ("Neo4j: bulk save complete — {} nodes, {} edges ({} skipped) in {}s" ,
161+ totalNodes , created , skipped , elapsed / 1000 );
162+ }
163+
164+ /** Convert a CodeNode to a flat property map for Cypher SET. */
165+ private Map <String , Object > nodeToProps (CodeNode node ) {
166+ Map <String , Object > props = new HashMap <>();
167+ props .put ("id" , node .getId ());
168+ props .put ("kind" , node .getKind ().getValue ());
169+ props .put ("label" , node .getLabel ());
170+ if (node .getFqn () != null ) props .put ("fqn" , node .getFqn ());
171+ if (node .getModule () != null ) props .put ("module" , node .getModule ());
172+ if (node .getFilePath () != null ) props .put ("filePath" , node .getFilePath ());
173+ if (node .getLineStart () != null ) props .put ("lineStart" , node .getLineStart ());
174+ if (node .getLineEnd () != null ) props .put ("lineEnd" , node .getLineEnd ());
175+ if (node .getLayer () != null ) props .put ("layer" , node .getLayer ());
176+ if (node .getAnnotations () != null && !node .getAnnotations ().isEmpty ()) {
177+ props .put ("annotations" , String .join ("," , node .getAnnotations ()));
178+ }
179+ if (node .getProperties () != null ) {
180+ for (var entry : node .getProperties ().entrySet ()) {
181+ if (entry .getValue () != null ) {
182+ props .put ("prop_" + entry .getKey (), entry .getValue ().toString ());
147183 }
148- tx .commit ();
149184 }
150185 }
151- log .info ("Edges: {} created, {} skipped (missing source/target node), {} total" ,
152- created , skipped , allEdges .size ());
186+ return props ;
153187 }
154188
155189 // --- Read operations (embedded API, no relationship hydration) ---
0 commit comments