Chapter Splitting Architecture
Goal: Split each curriculum PDF into chapter/lesson-level units before ingestion, so that retrieval operates at the correct granularity.
Overview
PDF Upload / Download
│
▼
┌─────────────────────────────────────────────────────────────┐
│ ChapterSplitter │
│ │
│ Layer 1: TOC Extraction │
│ └── Parse machine-readable TOC (pages 1-20; any page when toc_position="any") │
│ ▼ failure │
│ Layer 2: Header Scan │
│ └── Regex scan all pages for chapter headers │
│ ▼ failure │
│ Layer 3: Heuristic Split │
│ └── Equal-size page ranges (N pages / M chapters) │
│ ▼ failure │
│ Layer 4: Full Document │
│ └── Treat whole PDF as one chapter; flag for review │
└─────────────────────────────────────────────────────────────┘
│
▼
For each chapter:
├── Validate: does stated page actually contain header?
├── Strip running headers/footers
├── Create chapter PDF (page range extraction)
├── Create reference record in DB
└── Call existing IngestionService.execute_job()
Language-Specific Pattern Registry
Each language has its own extraction patterns registered in ChapterSplitter:
# Per-language extraction patterns, keyed by language code.
PATTERNS = {
    "fr": TOCPattern(
        toc_header=["Table des matières", "SOMMAIRE"],
        chapter_header=r"CHAPITRE\s*(\d+)",
        # Leader run followed by the page number at end of line. The leader
        # may be ASCII dots, U+2026 ellipsis characters, or a mix of both.
        page_number_in_toc=r"[.\u2026]{2,}\s*(\d+)\s*$",
        toc_position="front",  # Almost always within first 10 pages
        rtl=False,
        remove_diacritics=False,
    ),
    "en": TOCPattern(
        toc_header=["Table of contents", "Contents"],
        chapter_header=r"UNIT\s+([IVX\d]+|ONE|TWO|THREE|FOUR|FIVE)",
        # Whitespace after "Lesson" is optional so that both "Lesson 1" and
        # "Lesson1" are recognised.
        lesson_header=r"Lesson\s*\d+",
        # NOTE(review): this still requires a dot/ellipsis leader before the
        # page number; confirm that English TOC lines actually carry one.
        page_number_in_toc=r"[.\u2026]{2,}\s*(\d+)\s*$",
        toc_position="front",
        rtl=False,
        remove_diacritics=False,
    ),
    "ar": TOCPattern(
        toc_header=["الفهرس", "جدول المحتويات", "محتويات"],
        chapter_header=r"الدرس\s*(\d+)",
        unit_header=r"الوحدة\s*(الأولى|الثانية|الثالثة|\d+)",
        page_number_in_toc=r"(\d+)\s*-$",
        toc_position="any",  # Can be at end (page 95) or beginning
        rtl=True,
        remove_diacritics=True,  # Strip tashkeel for cleaner text
    ),
}
Layer 1: TOC Extraction
French
TOC page (e.g., page 5):
CHAPITRE 1
NOMBRES REELS ET OPERATIONS…………...………………………………7
CHAPITRE 2
ORDRE, INTERVALLES ET VALEURS ABSOLUE…………………………….21
Regex: CHAPITRE\s*(\d+)[^\d]*(\d+)
Result: [(1, "NOMBRES REELS ET OPERATIONS", 7), (2, "ORDRE...", 21)]
English
TOC page (e.g., page 7):
UNIT ONE Lesson1 Hello, students! 9
UNIT ONE Lesson2 Goodbye, bye-bye! 11
Regex: UNIT\s+([A-Z]+)\s+Lesson\s*(\d+)[^\d]*(\d+)
Result: [(1, "Hello, students!", 9), (1, "Goodbye, bye-bye!", 11)]
Arabic
TOC page (e.g., page 5):
الوحدة الأولى العلاقات الدولية بعد 1945 7
الدرس 1: هيئة الأمم المتحدة 8
Regex for units (applied per line): الوحدة\s*(الأولى|الثانية|الثالثة|\d+).*?(\d+)\s*$
Regex for lessons: الدرس\s*(\d+)[^\d]*(\d+)
Layer 2: Header Scan
If TOC extraction fails, scan all pages for structural headers:
def scan_headers(pdf_path: str, patterns: TOCPattern) -> list[ChapterEntry]:
    """Find chapter starts by scanning every page for a chapter header.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        patterns: Language patterns. ``chapter_header`` may be a single regex
            string or an iterable of regex strings; each regex must capture
            the chapter number in group 1.

    Returns:
        Entries in page order with a 1-indexed ``page_start``. ``page_end`` is
        the page before the next detected chapter; the last chapter runs to
        the end of the document. Empty when no header matches.
    """
    header_patterns = patterns.chapter_header
    if isinstance(header_patterns, str):
        # A bare string is ONE regex; iterating it directly would yield
        # single characters and search for each of them.
        header_patterns = [header_patterns]
    compiled = [re.compile(pattern) for pattern in header_patterns]

    # (page_start, chapter_number, title), appended in page order.
    hits = []
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        total_pages = len(doc)
        for page_index, page in enumerate(doc):
            text = page.get_text()
            for regex in compiled:
                match = regex.search(text)
                if match is None:
                    continue
                chapter_num = match.group(1)
                # Deduplicate: the same number on consecutive hits is treated
                # as a repeated (running) header, so only the first page of
                # the run is kept. Numbers that restart later are preserved.
                if not hits or hits[-1][1] != chapter_num:
                    # Extract title from surrounding context
                    title = extract_title_context(text, match)
                    hits.append((page_index + 1, chapter_num, title))  # 1-indexed
                break  # first matching pattern wins for this page

    chapters = []
    for position, (page_start, chapter_num, title) in enumerate(hits):
        if position + 1 < len(hits):
            page_end = hits[position + 1][0] - 1
        else:
            page_end = total_pages
        chapters.append(ChapterEntry(
            title=title,
            number=chapter_num,
            page_start=page_start,
            page_end=page_end,
            actual_page_start=None,
            page_verified=False,
            parsing_method="header_scan",
            confidence=0.6,  # "Header scan found, no TOC"
            needs_review=False,
            chapter_type="chapter",
            unit_title=None,
            unit_number=None,
        ))
    return chapters
Layer 3: Heuristic Split
If neither a TOC nor chapter headers are found, fall back to equal-size page ranges:
def heuristic_split(total_pages: int, inferred_chapter_count: int) -> list[ChapterEntry]:
    """
    Divide document into near-equal page ranges.
    Used when no structural headers are detectable.

    Args:
        total_pages: Number of pages in the document.
        inferred_chapter_count: Desired number of chapters. Capped at
            ``total_pages`` so that every chapter holds at least one page.

    Returns:
        Contiguous, non-overlapping 1-indexed ranges covering pages
        1..total_pages. Range sizes differ by at most one page. Empty when
        either argument is smaller than 1.
    """
    if total_pages < 1 or inferred_chapter_count < 1:
        # Nothing sensible to split; also avoids dividing by zero.
        return []

    chapter_count = min(inferred_chapter_count, total_pages)
    base_size, extra_pages = divmod(total_pages, chapter_count)

    chapters = []
    page_start = 1
    for index in range(chapter_count):
        # Spread the remainder over the first chapters instead of piling all
        # leftover pages onto the last one.
        size = base_size + (1 if index < extra_pages else 0)
        page_end = page_start + size - 1
        chapters.append(ChapterEntry(
            title=f"Chapter {index + 1}",
            number=str(index + 1),
            page_start=page_start,
            page_end=page_end,
            actual_page_start=None,
            page_verified=False,
            parsing_method="heuristic",
            confidence=0.3,
            # NOTE(review): heuristic ranges are guesses; confirm whether
            # they should be flagged for review by default.
            needs_review=False,
            chapter_type="chapter",
            unit_title=None,
            unit_number=None,
        ))
        page_start = page_end + 1
    return chapters
Page Verification
After extracting chapter boundaries from TOC or headers, always verify:
def verify_chapter_page(entry: ChapterEntry, doc: fitz.Document) -> ChapterEntry:
    """
    Check that the stated page actually contains the chapter header.
    If not, try ±1, ±2 pages.

    The entry is updated in place and returned. On success ``page_start`` and
    ``actual_page_start`` are set to the 1-indexed page where the header was
    found and ``page_verified`` is set. On failure ``page_verified`` is
    cleared and ``needs_review`` is set.
    """
    stated_index = entry.page_start - 1  # 0-indexed
    # Try the stated page first, then its nearest neighbours.
    for offset in (0, 1, -1, 2, -2):
        page_index = stated_index + offset
        if not 0 <= page_index < len(doc):
            continue
        if header_found_in_text(doc[page_index].get_text(), entry):
            entry.actual_page_start = page_index + 1
            entry.page_start = entry.actual_page_start
            entry.page_verified = True
            return entry

    entry.actual_page_start = None
    entry.page_verified = False
    entry.needs_review = True
    return entry
Noise Phrase Filtering
Running headers and footers appear on every page and must be stripped:
# Phrases removed from page text, keyed by language code.
NOISE_PHRASES = {
    "fr": ["IPN"],
    "en": ["IPN"],
    "ar": ["المعهد التربوي الوطني", "IPN"],
}


def strip_noise(text: str, language: str) -> str:
    """Return *text* without the language's noise phrases and page-number lines.

    Every occurrence of each phrase registered for ``language`` is deleted.
    Lines consisting solely of digits are then blanked, and the result is
    trimmed of leading and trailing whitespace. A language with no registered
    phrases still gets the page-number and trimming treatment.
    """
    cleaned = text
    for noise in NOISE_PHRASES.get(language, ()):
        cleaned = cleaned.replace(noise, "")

    # Phrases go first so that a footer reduced to a bare page number is
    # caught by the digit-only-line rule below.
    page_number_line = re.compile(r"^\s*\d+\s*$", re.MULTILINE)
    return page_number_line.sub("", cleaned).strip()
Confidence Scoring
Each extracted chapter carries a confidence score:
| Scenario | Confidence |
|---|---|
| TOC page number matches actual page + header text matches | 1.0 |
| TOC page number matches, header text differs slightly | 0.7 |
| Header scan found, no TOC | 0.6 |
| Heuristic split (equal ranges) | 0.3 |
| Full document fallback | 0.1 |
@dataclass
class ChapterEntry:
title: str
number: str
page_start: int
page_end: int
actual_page_start: int | None
page_verified: bool
parsing_method: str # "toc" | "header_scan" | "heuristic" | "full_doc"
confidence: float
needs_review: bool
chapter_type: str # "chapter" | "lesson" | "unit" | "surah"
unit_title: str | None # For lessons within a unit
unit_number: str | None
ChapterSplitResult
@dataclass
class ChapterSplitResult:
    """Outcome of splitting one PDF, together with the chapters found."""

    # One of: "success" | "partial" | "inferred" | "failed"
    status: str
    parsing_method: str | None
    total_chapters: int
    failed_entries: list[str]
    confidence: float
    chapters: list[ChapterEntry]
Database Schema
-- One row per chapter-level unit extracted from a parent PDF.
-- REFERENCES is a reserved keyword in PostgreSQL, so the table named
-- "references" must be double-quoted wherever it is used as an identifier.
CREATE TABLE chapter_splits (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    reference_id UUID NOT NULL REFERENCES "references"(id),
    parent_document_id UUID REFERENCES documents(id),
    chapter_title TEXT NOT NULL,
    chapter_number TEXT,
    chapter_type TEXT DEFAULT 'chapter',
    unit_title TEXT,
    unit_number TEXT,
    language TEXT NOT NULL,
    page_start INTEGER NOT NULL,
    page_end INTEGER NOT NULL,
    parsing_method TEXT NOT NULL,
    confidence FLOAT NOT NULL DEFAULT 1.0,
    needs_review BOOLEAN DEFAULT FALSE,
    ingestion_job_id UUID REFERENCES ingestion_jobs(id),
    ingestion_status TEXT DEFAULT 'pending',
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX idx_chapter_splits_reference ON chapter_splits(reference_id);
CREATE INDEX idx_chapter_splits_language ON chapter_splits(language);
-- Partial index: only rows awaiting review are indexed.
CREATE INDEX idx_chapter_splits_needs_review ON chapter_splits(needs_review)
    WHERE needs_review = TRUE;

ALTER TABLE "references" ADD COLUMN split_status TEXT DEFAULT 'not_applicable';
ALTER TABLE "references" ADD COLUMN total_chapters INTEGER;
API Interface
class ChapterSplitter:
    """Interface for splitting a curriculum PDF into chapter-level units."""

    def __init__(self, supabase: Client):
        """Keep the Supabase client on the instance."""
        self.supabase = supabase

    def split(
        self,
        pdf_bytes: bytes,
        language_hint: str | None = None,
        expected_chapter_count: int | None = None,
    ) -> ChapterSplitResult:
        """Split a PDF into chapters; this is the main entry point.

        Args:
            pdf_bytes: Raw bytes of the PDF file.
            language_hint: "fr", "en" or "ar"; None means auto-detect.
            expected_chapter_count: Chapter count, if known. Used by the
                heuristic fallback.

        Returns:
            A ChapterSplitResult holding every chapter found plus metadata.
        """

    def preview(
        self,
        pdf_bytes: bytes,
        language_hint: str | None = None,
    ) -> ChapterSplitResult:
        """Extract chapters as a dry run, without writing to the DB.

        Intended for human review before committing to ingestion.
        """

    def ingest(
        self,
        pdf_bytes: bytes,
        reference_id: UUID,
        language_hint: str | None = None,
    ) -> list[dict]:
        """Run the full pipeline: split, create reference records, trigger ingestion.

        Returns:
            The list of created chapter_split records.
        """
Implementation Location
app/
services/
chapter_splitter.py # NEW: ChapterSplitter class
asset_extractor.py # NEW: AssetExtractor (see asset-extraction.md)
No changes are required to the existing IngestionService or retrieval_pipeline.py.