fix: PDF extraction quality — word-boundary checks and layout mode

Adds _text_quality_ok() gate that replaces the bare 50-char length
check at each stage of the extraction fallback chain. Checks:
- Word-boundary ratio (≥60% of tokens must be real words)
- Concatenation ratio (lc→UC transitions must be <10% of word count)

When PyPDF2 default extraction fails quality check, retries with
space_width=100 for tighter word-boundary detection. This fixes
Haynes/workshop manuals where tight kerning produces concatenated
words like 'byMike' and 'oftheGuild'.

Also adds -layout flag to pdftotext subprocess calls for better
spatial awareness in the poppler fallback stage.

Note: PyPDF2 3.0.1 does not support layout=True parameter.
The space_width parameter serves the same purpose.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Matt 2026-05-07 01:36:23 +00:00
commit 83a21854c3

View file

@ -21,6 +21,7 @@ Config: processing.extract_workers, processing.max_pdf_size_mb,
processing.extract_timeout, processing.page_timeout processing.extract_timeout, processing.page_timeout
""" """
import base64 import base64
import re
import json import json
import os import os
import random import random
@ -99,6 +100,40 @@ def _is_transient(error_str):
return any(sig in s for sig in transient_signals) return any(sig in s for sig in transient_signals)
def _text_quality_ok(text, min_length=50):
"""Check if extracted text meets quality thresholds.
Beyond the basic length check, validates:
- Word-boundary ratio: at least 60% of tokens should be real words (2+ alpha chars)
- Concatenation ratio: lowercase-immediately-followed-by-uppercase shouldn't exceed 10% of word count
Returns True if text passes all checks.
"""
text = text.strip()
if len(text) < min_length:
return False
words = text.split()
if not words:
return False
# Word-like ratio: tokens with 2+ alphabetic characters
word_like = sum(1 for w in words if len(re.findall(r'[a-zA-Z]', w)) >= 2)
word_ratio = word_like / len(words)
if word_ratio < 0.60:
return False
# Concatenation detector: lowercase immediately followed by uppercase
# Filter out common camelCase patterns in code (short tokens)
concat_hits = len(re.findall(r'[a-z][A-Z]', text))
concat_ratio = concat_hits / len(words) if words else 0
if concat_ratio > 0.10:
return False
return True
def _render_page_to_png(pdf_path, page_num_1indexed, dpi=200, timeout=30): def _render_page_to_png(pdf_path, page_num_1indexed, dpi=200, timeout=30):
"""Render a single PDF page to PNG bytes using pdftoppm. """Render a single PDF page to PNG bytes using pdftoppm.
@ -224,7 +259,7 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30):
# Method 1: pdftotext (poppler) # Method 1: pdftotext (poppler)
try: try:
result = subprocess.run( result = subprocess.run(
['pdftotext', '-f', str(page_num_0indexed + 1), ['pdftotext', '-layout', '-f', str(page_num_0indexed + 1),
'-l', str(page_num_0indexed + 1), pdf_path, '-'], '-l', str(page_num_0indexed + 1), pdf_path, '-'],
capture_output=True, text=True, timeout=page_timeout capture_output=True, text=True, timeout=page_timeout
) )
@ -233,7 +268,7 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30):
except Exception: except Exception:
pass pass
if len(text.strip()) >= 50: if _text_quality_ok(text):
return text, 'pdftotext' return text, 'pdftotext'
# Method 2: pdftoppm + Tesseract OCR # Method 2: pdftoppm + Tesseract OCR
@ -258,7 +293,7 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30):
except Exception: except Exception:
pass pass
if len(text.strip()) >= 50: if _text_quality_ok(text):
return text, 'tesseract' return text, 'tesseract'
# Method 3: Gemini Vision (last resort) # Method 3: Gemini Vision (last resort)
@ -276,8 +311,26 @@ def _extract_page_without_reader(pdf_path, page_num_0indexed, page_timeout=30):
# ── Core extraction functions ── # ── Core extraction functions ──
def _pypdf2_extract(reader, page_num): def _pypdf2_extract(reader, page_num):
"""Extract text from a PyPDF2 page object. Runs inside a thread for timeout.""" """Extract text from a PyPDF2 page object. Runs inside a thread for timeout.
return reader.pages[page_num].extract_text() or ''
Tries default extraction first (space_width=200). If quality check fails,
retries with space_width=100 which better detects word boundaries in
tightly-kerned PDFs (common in Haynes/workshop manuals).
Note: PyPDF2 3.0.1 does not support layout=True. The space_width parameter
controls word-boundary detection tolerance. Lower values = more aggressive
space insertion between characters.
"""
text = reader.pages[page_num].extract_text() or ''
if _text_quality_ok(text):
return text
# Retry with tighter word-boundary detection
text_tight = reader.pages[page_num].extract_text(space_width=100.0) or ''
if len(text_tight.strip()) >= len(text.strip()):
return text_tight
return text
def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30): def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30):
@ -302,13 +355,13 @@ def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30):
except Exception: except Exception:
text = '' text = ''
if len(text.strip()) >= 50: if _text_quality_ok(text):
return text, 'pypdf2' return text, 'pypdf2'
# Method 2: pdftotext via subprocess (inherently timeout-safe) # Method 2: pdftotext via subprocess (inherently timeout-safe)
try: try:
result = subprocess.run( result = subprocess.run(
['pdftotext', '-f', str(page_num + 1), '-l', str(page_num + 1), pdf_path, '-'], ['pdftotext', '-layout', '-f', str(page_num + 1), '-l', str(page_num + 1), pdf_path, '-'],
capture_output=True, text=True, timeout=page_timeout capture_output=True, text=True, timeout=page_timeout
) )
if result.returncode == 0 and len(result.stdout.strip()) > len(text.strip()): if result.returncode == 0 and len(result.stdout.strip()) > len(text.strip()):
@ -316,7 +369,7 @@ def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30):
except Exception: except Exception:
pass pass
if len(text.strip()) >= 50: if _text_quality_ok(text):
return text, 'pdftotext' return text, 'pdftotext'
# Method 3: pdftoppm + Tesseract OCR # Method 3: pdftoppm + Tesseract OCR
@ -340,7 +393,7 @@ def extract_text_from_page(reader, page_num, pdf_path, page_timeout=30):
except Exception: except Exception:
pass pass
if len(text.strip()) >= 50: if _text_quality_ok(text):
return text, 'tesseract' return text, 'tesseract'
# Method 4: Gemini Vision (last resort — costs API calls but handles scanned docs) # Method 4: Gemini Vision (last resort — costs API calls but handles scanned docs)