Commit b72d7ee

Enforce character-based chunking for large custom prompts
Adds a hard character count limit (50,000 chars) for content analyzed with custom prompts to address issues with inaccurate token counting. Content exceeding this limit is split by character count to ensure safe processing, improving reliability when tiktoken misestimates token usage.
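
The rule this commit enforces is a fixed-size character split. Below is a minimal runnable sketch of that rule under the commit's worst-case assumption (1 char = 1 token); the helper name `split_by_chars` and the example sizes are illustrative, and only the 50,000-character limit comes from the commit:

```python
# Sketch of character-based chunking, assuming the commit's worst case of
# 1 char = 1 token. Only MAX_CHARS_PER_CHUNK = 50000 is taken from the commit;
# the function name and the example below are hypothetical.
MAX_CHARS_PER_CHUNK = 50000

def split_by_chars(content: str, max_chars: int = MAX_CHARS_PER_CHUNK) -> list[str]:
    """Split content into fixed-size character chunks, ignoring token estimates."""
    return [content[i:i + max_chars] for i in range(0, len(content), max_chars)]

# 120,000 characters split into chunks of 50k, 50k, and 20k.
chunks = split_by_chars("x" * 120_000)
assert [len(c) for c in chunks] == [50_000, 50_000, 20_000]
```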
1 parent 9775c63 commit b72d7ee

1 file changed

app/services/ai/ai_moderator.py

Lines changed: 22 additions & 12 deletions
@@ -295,30 +295,40 @@ def moderate_content(self, content, content_type='text', custom_prompt=None):
 
         # STEP 1: If custom prompt is provided, use ONLY custom prompt analysis
         if custom_prompt:
-            # Calculate max content tokens based on custom prompt size
-            max_content_tokens = self.calculate_max_content_tokens(custom_prompt)
+            # CRITICAL FIX: Hard limit on character count (tiktoken is broken for some content)
+            # Assume worst case: 1 char = 1 token for safety
+            MAX_CHARS_PER_CHUNK = 50000  # ~50k tokens worst case, safe for any prompt
+            content_chars = len(content)
+
             current_app.logger.info(
-                f"Content has {content_tokens} tokens, max allowed: {max_content_tokens}")
+                f"Content: {content_tokens} tokens (tiktoken), {content_chars} chars")
+
+            # Force chunking if content is too large BY CHARACTER COUNT
+            if content_chars > MAX_CHARS_PER_CHUNK:
+                current_app.logger.warning(
+                    f"FORCING CHUNKING: Content too large ({content_chars} chars > {MAX_CHARS_PER_CHUNK})")
+
+                # Split by character count, not tokens
+                chunks = []
+                for i in range(0, content_chars, MAX_CHARS_PER_CHUNK):
+                    chunks.append(content[i:i + MAX_CHARS_PER_CHUNK])
 
-            if content_tokens <= max_content_tokens:
-                return self._analyze_with_custom_prompt(content, custom_prompt)
-            else:
-                # Split content and analyze each chunk
-                chunks = self.split_text_into_chunks(content, max_content_tokens)
                 current_app.logger.info(
-                    f"Split content into {len(chunks)} chunks for custom prompt analysis")
+                    f"Split content into {len(chunks)} chunks by character count")
 
                 chunk_results = []
                 for i, chunk in enumerate(chunks):
-                    result = self._analyze_with_custom_prompt(
-                        chunk, custom_prompt)
+                    result = self._analyze_with_custom_prompt(chunk, custom_prompt)
                     chunk_results.append(result)
 
-                    # Early exit if chunk is rejected (for efficiency)
+                    # Early exit if chunk is rejected
                     if result['decision'] == 'rejected':
                         break
 
                 return self._combine_chunk_results(chunk_results, len(content))
+            else:
+                # Content is small enough, process normally
+                return self._analyze_with_custom_prompt(content, custom_prompt)
 
         # STEP 2: For default moderation, run baseline check first
         # Note: OpenAI moderation API has its own limits, but typically handles larger content
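
The chunk loop's early exit can be demonstrated in isolation. In the sketch below, `analyze` and `combine` are simplified stand-ins for `_analyze_with_custom_prompt` and `_combine_chunk_results`, whose real implementations are outside this diff; the any-chunk-rejected rule is an assumption for illustration:

```python
# Stand-ins for _analyze_with_custom_prompt / _combine_chunk_results; the
# rejection rule here is assumed for illustration, not taken from the repo.
def analyze(chunk: str) -> dict:
    return {'decision': 'rejected' if 'banned' in chunk else 'approved'}

def combine(results: list[dict]) -> dict:
    rejected = any(r['decision'] == 'rejected' for r in results)
    return {'decision': 'rejected' if rejected else 'approved'}

chunk_results = []
for chunk in ["clean text", "contains banned words", "never analyzed"]:
    result = analyze(chunk)
    chunk_results.append(result)
    if result['decision'] == 'rejected':
        break  # early exit: remaining chunks are never sent to the model

assert combine(chunk_results) == {'decision': 'rejected'}
assert len(chunk_results) == 2  # the third chunk was skipped
```

The payoff of the early exit, visible in the diff above, is that obviously violating content is rejected without analyzing every 50,000-character chunk.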
