
Commit 464fb0e

Increase concurrency and cache limits for AI moderation
- Raised ThreadPoolExecutor worker caps from 10 to 50 in AI moderation and rule processing for better parallelism under load.
- Increased the max cache size in result_cache.py from 10k to 50k entries (cleanup threshold 9k -> 45k) to handle higher concurrency.
- Raised the OpenAI client's httpx connection pool limits (keepalive 200 -> 500, total 1000 -> 2000) for better burst handling.
- Increased the max chunk size for content processing from 100k to 150k characters: fewer chunks per request with large-context models.
1 parent: bede582

4 files changed: 14 additions & 10 deletions


app/services/ai/ai_moderator.py

Lines changed: 7 additions & 4 deletions
@@ -308,8 +308,9 @@ def moderate_content(self, content, content_type='text', custom_prompt=None):
         if custom_prompt:
             # CRITICAL FIX: Hard limit on character count (tiktoken is broken for some content)
             # Assume worst case: 1 char = 1 token for safety
-            # Increased from 50k to 100k for better performance (fewer chunks = faster)
-            MAX_CHARS_PER_CHUNK = 100000  # ~100k tokens worst case, safe for large context models
+            # Increased from 50k -> 100k -> 150k for better performance (fewer chunks = faster)
+            # With 400k context window and 70% safety margin, 150k is safe
+            MAX_CHARS_PER_CHUNK = 150000  # ~150k tokens worst case, safe for large context models
             content_chars = len(content)

             # Force chunking if content is too large BY CHARACTER COUNT
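As a sanity check on the new limit: a 400k-token context window with a 70% safety margin leaves about 280k usable tokens, so a 150,000-character chunk fits even at the assumed worst case of one token per character. A minimal standalone sketch of this character-based splitting (the function name is hypothetical; the real logic lives inside moderate_content):

MAX_CHARS_PER_CHUNK = 150_000  # worst case: 1 char = 1 token

def split_by_chars(content: str) -> list[str]:
    # 400_000 * 0.7 = 280_000 usable tokens, so a 150k-char slice
    # is safe even when every character costs a full token.
    return [content[i:i + MAX_CHARS_PER_CHUNK]
            for i in range(0, len(content), MAX_CHARS_PER_CHUNK)]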
@@ -324,8 +325,9 @@ def moderate_content(self, content, content_type='text', custom_prompt=None):
                 chunks.append(content[i:i + MAX_CHARS_PER_CHUNK])

             # Process all chunks IN PARALLEL for maximum speed
+            # Increased from 10 to 50 workers for better concurrency under load
             chunk_results = []
-            with ThreadPoolExecutor(max_workers=min(len(chunks), 10)) as executor:
+            with ThreadPoolExecutor(max_workers=min(len(chunks), 50)) as executor:
                 # Submit all chunks at once with context wrapper
                 future_to_chunk = {
                     executor.submit(self._context_wrapper, self._analyze_with_custom_prompt, chunk, custom_prompt): i
@@ -373,8 +375,9 @@ def moderate_content(self, content, content_type='text', custom_prompt=None):
             chunks = self.split_text_into_chunks(content, max_content_tokens)

             # Process all chunks IN PARALLEL for maximum speed
+            # Increased from 10 to 50 workers for better concurrency under load
             chunk_results = []
-            with ThreadPoolExecutor(max_workers=min(len(chunks), 10)) as executor:
+            with ThreadPoolExecutor(max_workers=min(len(chunks), 50)) as executor:
                 # Submit all chunks at once with context wrapper
                 future_to_chunk = {
                     executor.submit(self._context_wrapper, self._run_enhanced_default_moderation, chunk): i
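Both hunks fan chunks out with the same pattern, and rule_processor.py below reuses it for rules. A minimal sketch, with a hypothetical analyze callable standing in for the wrapped moderation methods:

from concurrent.futures import ThreadPoolExecutor, as_completed

def moderate_chunks(chunks, analyze):
    # One task per chunk, capped at 50 workers as in the diff;
    # min() keeps small inputs from spawning idle threads.
    results = [None] * len(chunks)
    with ThreadPoolExecutor(max_workers=min(len(chunks), 50)) as executor:
        future_to_index = {executor.submit(analyze, chunk): i
                           for i, chunk in enumerate(chunks)}
        for future in as_completed(future_to_index):
            results[future_to_index[future]] = future.result()
    return results  # original chunk order, whatever the completion order

The future-to-index map is why results come back in chunk order even though as_completed yields futures as they finish.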

app/services/ai/openai_client.py

Lines changed: 3 additions & 3 deletions
@@ -37,9 +37,9 @@ def _get_or_create_client(cls, api_key):
                 pool=2.0
             ),
             limits=httpx.Limits(
-                max_keepalive_connections=200,  # Increased connection pool
-                max_connections=1000,  # Increased total connections
-                keepalive_expiry=300.0  # Longer keepalive
+                max_keepalive_connections=500,  # Increased from 200 to 500 for high concurrency
+                max_connections=2000,  # Increased from 1000 to 2000 for burst handling
+                keepalive_expiry=300.0  # Longer keepalive (5 minutes)
             ),
             http2=True
         )
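For reference, a standalone httpx client with the new pool values. Only pool=2.0 and the Limits arguments come from this diff; the 30-second default timeout and the client construction around them are assumptions:

import httpx

client = httpx.Client(
    timeout=httpx.Timeout(30.0, pool=2.0),  # 30s default is assumed; pool=2.0 is in the diff
    limits=httpx.Limits(
        max_keepalive_connections=500,  # idle connections kept warm for reuse
        max_connections=2000,           # hard cap during bursts
        keepalive_expiry=300.0,         # recycle idle sockets after 5 minutes
    ),
    http2=True,  # requires the httpx[http2] extra
)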

app/services/ai/result_cache.py

Lines changed: 2 additions & 2 deletions
@@ -13,8 +13,8 @@ class ResultCache:
     _shared_cache_ttl = 3600  # 1 hour
     _current_request_stores = 0  # Track stores per request
     _cache_lock = RLock()  # Thread-safe operations
-    _max_cache_size = 10000  # Maximum cache entries (increased to handle high volume)
-    _cleanup_threshold = 9000  # Start cleanup when reaching 90% capacity
+    _max_cache_size = 50000  # Maximum cache entries (increased from 10k to 50k for high concurrency)
+    _cleanup_threshold = 45000  # Start cleanup when reaching 90% capacity
     _last_cleanup_time = 0
     _cleanup_interval = 900  # Check for expired entries every 15 minutes
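A sketch of how the two constants interact: stores trigger cleanup at 90% of capacity, so the cache never hits its hard limit mid-request. The oldest-first eviction shown is an assumption; this diff doesn't show the real policy:

import time
from threading import RLock

class BoundedCache:
    # Sketch of threshold-triggered cleanup; not the real ResultCache.
    _cache = {}                  # key -> (value, stored_at)
    _lock = RLock()
    _max_cache_size = 50_000
    _cleanup_threshold = 45_000  # 90% of capacity

    @classmethod
    def store(cls, key, value):
        with cls._lock:
            if len(cls._cache) >= cls._cleanup_threshold:
                cls._evict_oldest()
            cls._cache[key] = (value, time.time())

    @classmethod
    def _evict_oldest(cls):
        # Assumed policy: drop the oldest 10% of capacity so cleanup
        # runs only occasionally rather than on every store.
        oldest = sorted(cls._cache, key=lambda k: cls._cache[k][1])
        for key in oldest[:cls._max_cache_size // 10]:
            del cls._cache[key]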

app/services/moderation/rule_processor.py

Lines changed: 2 additions & 1 deletion
@@ -103,7 +103,8 @@ def process_single_ai_rule(rule):
            return (rule.id, None)

    # Execute in parallel
-    with ThreadPoolExecutor(max_workers=min(len(ai_rules), 10)) as executor:
+    # Increased from 10 to 50 workers for better concurrency under load
+    with ThreadPoolExecutor(max_workers=min(len(ai_rules), 50)) as executor:
        futures = {executor.submit(
            process_single_ai_rule, rule): rule for rule in ai_rules}
