Asset Extraction Architecture

Goal: Extract visual content (images, tables, diagrams) from curriculum PDFs, preserve images for display, and generate searchable text descriptions for retrieval.

Overview

Chapter PDF (from ChapterSplitter)
         │
         ▼
┌─────────────────────────────────────────────────────────────┐
│                    AssetExtractor                            │
│                                                             │
│  1. Extract images with bounding boxes                      │
│  2. Extract tables with structure                           │
│  3. Extract text blocks with positions                      │
│  4. Extract vector drawings (graphs, diagrams)             │
└─────────────────────────────────────────────────────────────┘
         │
         ▼
┌─────────────────────────────────────────────────────────────┐
│                  ContentClassifier                           │
│                                                             │
│  Is this image content-relevant or decorative?              │
│  Signals: size, OCR text, caption proximity, page type     │
└─────────────────────────────────────────────────────────────┘
         │
         ▼
┌─────────────────────────────────────────────────────────────┐
│                   ImageDescriber                            │
│                                                             │
│  For content-relevant images only:                         │
│  - OCR: extract visible text                               │
│  - Vision model: semantic description + alt text            │
│  - Search keywords: boost text searchability               │
└─────────────────────────────────────────────────────────────┘
         │
         ▼
         │
    ┌────┴────┐
    │         │
Text Chunks  CurriculumAssets
 (embedded)   (stored)

Content Type Taxonomy

Every chunk and asset carries a content_type:

class ContentType:
    """String constants for the ``content_type`` label carried by chunks and assets."""
    TEXT = "text"              # Pure prose text
    IMAGE = "image"            # Visual content - must be preserved
    TABLE = "table"            # Structured tabular data
    COMPOSITE = "composite"    # Mix of text + image on same page
    DECORATIVE = "decorative"  # Borders, logos, page numbers - skip

AssetExtractor

class ExtractedImage(NamedTuple):
    """An image pulled from a PDF page together with its placement metadata."""
    image_id: str  # Identifier string for this image
    bytes: bytes  # Encoded image data as extracted from the PDF
    width: int  # Image width; the extractor falls back to 0 when unknown
    height: int  # Image height; the extractor falls back to 0 when unknown
    bbox: tuple[float, float, float, float]  # x0, y0, x1, y1
    page_num: int  # 1-based page number (the extractor passes enumerate index + 1)
    ocr_text: str | None  # None until a separate OCR pass fills it in
    extraction_method: str  # "native" | "vector"

class ExtractedTable(NamedTuple):
    """A table found on a PDF page, rendered as Markdown text."""
    table_id: str  # Identifier string for this table
    markdown: str              # Structured text representation
    bbox: tuple[float, float, float, float]  # Table bounding box on the page
    page_num: int  # Page the table was found on
    row_count: int  # Number of rows, header row included
    col_count: int  # Number of cells in the header row

class ExtractedTextBlock(NamedTuple):
    """A block of page text with its position and two heuristic flags."""
    text: str  # Block text with surrounding whitespace stripped
    bbox: tuple[float, float, float, float]  # Block bounding box on the page
    page_num: int  # 1-based page number (the extractor passes enumerate index + 1)
    is_heading: bool  # Result of the extractor's heading heuristic
    is_caption: bool  # Result of the extractor's caption heuristic

class AssetExtractionResult(NamedTuple):
    """Everything ``AssetExtractor.extract`` collected from one PDF."""
    images: list[ExtractedImage]  # Images from all pages, in page order
    tables: list[ExtractedTable]  # Tables from all pages, in page order
    text_blocks: list[ExtractedTextBlock]  # Non-empty text blocks from all pages
    drawings: list  # Vector graphics
    total_pages: int  # Page count of the source document

class AssetExtractor:
    """Pull images, tables, text blocks and vector drawings out of a chapter PDF.

    The extractor walks every page with PyMuPDF (``fitz``) and returns the raw
    material for the later classification / description stages. Page numbers in
    all returned records are 1-based.
    """

    # Substrings that mark a short text block as a figure caption.
    # A bare "ص" used to be in this list; as a single letter it matched nearly
    # every Arabic sentence, so only whole caption words are kept ("صورة"
    # already covers the photo case).
    _CAPTION_MARKERS = ("شكل", "صورة", "Figure")

    def __init__(self):
        # Publisher boilerplate per language code. Nothing in this class reads
        # the table; it is exposed for the cleaning step that consumes it.
        self.noise_phrases = {
            "ar": ["المعهد التربوي الوطني", "IPN"],
            "fr": ["IPN"],
            "en": ["IPN"],
        }

    def extract(self, pdf_bytes: bytes) -> AssetExtractionResult:
        """Extract all assets from an in-memory PDF.

        Args:
            pdf_bytes: The complete PDF file content.

        Returns:
            An ``AssetExtractionResult`` with images, tables, text blocks and
            drawings in page order, plus the document's page count.
        """
        images = []
        tables = []
        text_blocks = []
        drawings = []

        # In-memory documents are opened with stream=/filetype=; the context
        # manager closes the document even if a page fails to parse.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            total_pages = len(doc)

            for page_num, page in enumerate(doc, start=1):
                blocks = page.get_text("dict")["blocks"]
                for block in blocks:
                    if block["type"] == 0:  # text block
                        text = self._block_text(block)
                        if not text:
                            continue
                        text_blocks.append(ExtractedTextBlock(
                            text=text,
                            bbox=block["bbox"],
                            page_num=page_num,
                            is_heading=self._is_heading(text),
                            is_caption=self._is_caption(text, blocks),
                        ))
                    elif block["type"] == 1:  # image block
                        img = self._extract_image(block, page_num)
                        if img is not None:
                            images.append(img)

                # Vector drawings (graphs, diagrams)
                paths = page.get_drawings()
                if paths:
                    drawings.extend(self._extract_drawings(paths, page_num))

                # Tables: find_tables() returns a finder whose .tables list
                # holds the individual table objects.
                for table in page.find_tables().tables:
                    tbl = self._extract_table(table, page_num)
                    if tbl is not None:
                        tables.append(tbl)

        return AssetExtractionResult(
            images=images,
            tables=tables,
            text_blocks=text_blocks,
            drawings=drawings,
            total_pages=total_pages,
        )

    @staticmethod
    def _block_text(block: dict) -> str:
        """Join the span texts of a ``get_text("dict")`` text block.

        Text blocks in dict output have no top-level "text" key; the text lives
        in ``block["lines"][i]["spans"][j]["text"]``. Spans of one line are
        concatenated, lines are separated by a newline, and the result is
        stripped.
        """
        lines = [
            "".join(span.get("text", "") for span in line.get("spans", []))
            for line in block.get("lines", [])
        ]
        return "\n".join(lines).strip()

    def _extract_image(self, block: dict, page_num: int) -> ExtractedImage | None:
        """Build an ``ExtractedImage`` from an image block.

        Image blocks in dict output embed the encoded image under "image"
        together with "width" and "height"; they carry no xref, so the bytes are
        taken from the block itself.

        Returns:
            The extracted image, or ``None`` when the block holds no image data.
        """
        import hashlib

        data = block.get("image")
        if not data:
            return None

        bbox = block["bbox"]
        # Stable identifier derived from page, placement and content, so the
        # same PDF always yields the same ids.
        digest = hashlib.sha256(data + repr(tuple(bbox)).encode("utf-8")).hexdigest()[:12]
        return ExtractedImage(
            image_id=f"img_{page_num}_{digest}",
            bytes=data,
            width=block.get("width", 0),
            height=block.get("height", 0),
            bbox=bbox,
            page_num=page_num,
            ocr_text=None,  # Done separately by OCR pass
            extraction_method="native",
        )

    def _extract_drawings(self, paths: list, page_num: int) -> list:
        """Tag each vector path dict from ``page.get_drawings()`` with its page.

        Returns a list of shallow copies of the path dicts, each with an added
        "page_num" key; the input dicts are not modified.
        """
        return [{**path, "page_num": page_num} for path in paths]

    def _extract_table(self, table, page_num: int) -> ExtractedTable | None:
        """Convert one detected table by delegating to ``TableExtractor``."""
        return TableExtractor().extract_table(table, page_num)

    def _is_heading(self, text: str) -> bool:
        """Heuristic: at most 100 chars and all caps or an Arabic lesson title.

        Font size is not inspected; only the text itself is examined.
        """
        if len(text) > 100:
            return False
        return text.isupper() or bool(re.match(r'^الدرس\s*\d+', text))

    def _is_caption(self, text: str, all_blocks: list) -> bool:
        """Heuristic: text under 200 chars containing a caption word.

        ``all_blocks`` is accepted for signature compatibility but not used;
        proximity to an image block is not checked.
        """
        return len(text) < 200 and any(marker in text for marker in self._CAPTION_MARKERS)

ContentClassifier

Determines whether an image is content-relevant or decorative.

class ContentClassifier:
    """Score an extracted image as content-relevant or decorative."""

    # Lower-case words whose presence in an image's OCR text marks it as
    # decorative. Matched as whole words (see is_content_image).
    DECORATIVE_SIGNALS = [
        "ipn", "logo", "copyright",
    ]

    MIN_CONTENT_AREA = 50_000   # px² - images larger than this count as "large"
    MEDIUM_CONTENT_AREA = 10_000  # px² - images at or below this are likely icons

    # Arabic caption words are matched as substrings because prefixes such as
    # "ال" attach directly to the word.
    _ARABIC_CAPTION_WORDS = ("شكل", "صورة", "انظر")
    # Latin-script caption words are matched as whole words, case-insensitively.
    # NOTE(review): the original list held "leigh", which is not a caption word
    # in Arabic, French or English (and matched e.g. "Raleigh"); the French
    # "légende" is assumed to be what was meant — confirm.
    _LATIN_CAPTION_PATTERN = r"\b(?:fig(?:ure)?s?|légende)\b"

    def is_content_image(self, img: ExtractedImage, page_context: str) -> tuple[bool, float]:
        """
        Returns (is_content, confidence_score).

        Confidence scoring:
        - Has substantial OCR text (>10 chars): +0.8
        - Large image area (>50k px²): +0.6
        - Medium image area (10k-50k px²): +0.3
        - Small image area (<=10k px²): -0.2
        - Has nearby caption: +0.7
        - Matches decorative pattern: -0.9 per matching pattern

        ``is_content`` is decided on the raw score (> 0.3); the returned
        confidence is that score clamped to [0.0, 1.0].
        """
        import re

        score = 0.0
        ocr_text = (img.ocr_text or "").strip()

        # Signal 1: OCR text content
        if len(ocr_text) > 10:
            score += 0.8

        # Signal 2: Image size
        area = img.width * img.height
        if area > self.MIN_CONTENT_AREA:
            score += 0.6
        elif area > self.MEDIUM_CONTENT_AREA:
            score += 0.3
        else:
            score -= 0.2

        # Signal 3: Caption nearby
        if page_context and self._caption_near_bbox(img.bbox, page_context):
            score += 0.7

        # Signal 4: Decorative pattern match. Whole-word matching keeps "logo"
        # from firing on words such as "homologous" or "analogous".
        if ocr_text:
            ocr_lower = ocr_text.lower()
            for pattern in self.DECORATIVE_SIGNALS:
                if re.search(rf"\b{re.escape(pattern)}\b", ocr_lower):
                    score -= 0.9

        # Signal 5: Is on illustration-heavy page
        # (pages with many images, few paragraphs)
        # TODO: not implemented yet.

        is_content = score > 0.3
        return is_content, max(0.0, min(1.0, score))

    def _caption_near_bbox(self, bbox: tuple, context: str) -> bool:
        """Check whether caption-like wording appears in the page context.

        ``bbox`` is accepted for signature compatibility but not used: the check
        is purely textual and does not measure distance to the image.
        """
        import re

        if not context:
            return False
        if any(word in context for word in self._ARABIC_CAPTION_WORDS):
            return True
        return re.search(self._LATIN_CAPTION_PATTERN, context, re.IGNORECASE) is not None

ImageDescriber

Generates semantic descriptions for content-relevant images using a vision model.

class ImageDescription(NamedTuple):
    """Text generated for one image so it can be displayed and searched."""
    asset_id: str               # Identifier of the described image
    ocr_text: str               # Raw OCR from image
    semantic_description: str    # What the image shows
    alt_text: str               # For frontend display
    search_keywords: list[str]   # Boost searchability

class ImageDescriber:
    """Generate OCR text, a description, alt text and keywords for an image.

    The image is sent to a vision-capable chat model together with a prompt
    that asks for four labelled fields; the reply is parsed back into an
    ``ImageDescription``.
    """

    # Labels the prompt asks the model to use, in reply order.
    _FIELD_LABELS = ("OCR_TEXT", "SEMANTIC_DESCRIPTION", "ALT_TEXT", "SEARCH_KEYWORDS")

    def __init__(self, openai_client: OpenAI, model: str = "gpt-4o", max_tokens: int = 500):
        """
        Args:
            openai_client: Client exposing ``chat.completions.create``.
            model: Vision-capable model name.
            max_tokens: Upper bound on the length of the model reply.
        """
        self.client = openai_client
        self.model = model
        self.max_tokens = max_tokens

    def describe(
        self,
        img: ExtractedImage,
        chapter_context: str,  # Surrounding text from the page/chapter
        language: str,
    ) -> ImageDescription:
        """
        Use the vision model to generate:
        1. OCR of any text in the image
        2. Semantic description
        3. Alt text for frontend
        4. Search keywords
        """
        img_b64 = base64.b64encode(img.bytes).decode()
        # Extracted images are not always PNG, so the data URL must state the
        # real format.
        mime_type = self._detect_mime_type(img.bytes)

        prompt = self._build_prompt(chapter_context, language)

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{img_b64}",
                            },
                        },
                    ],
                }
            ],
            max_tokens=self.max_tokens,
        )

        return self._parse_response(response, img.image_id)

    @staticmethod
    def _detect_mime_type(data: bytes) -> str:
        """Guess the image MIME type from its leading magic bytes.

        Recognises PNG, JPEG, GIF and WebP; anything else falls back to
        "image/png".
        """
        if data.startswith(b"\x89PNG\r\n\x1a\n"):
            return "image/png"
        if data.startswith(b"\xff\xd8\xff"):
            return "image/jpeg"
        if data.startswith((b"GIF87a", b"GIF89a")):
            return "image/gif"
        if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
            return "image/webp"
        return "image/png"

    def _build_prompt(self, context: str, language: str) -> str:
        """Build the instruction text sent alongside the image.

        Args:
            context: Surrounding page/chapter text; only the first 500
                characters are included. ``None`` is treated as empty.
            language: "ar", "fr" or "en"; any other value falls back to the
                English instruction.
        """
        lang_instruction = {
            "ar": "Respond in Arabic where possible.",
            "fr": "Respond in French.",
            "en": "Respond in English.",
        }.get(language, "Respond in English.")
        excerpt = (context or "")[:500]

        return f"""
This image appears in an educational textbook.
Context from the surrounding page/chapter:
\"\"\"{excerpt}\"\"\"

{lang_instruction}

Please analyze this image and provide:
1. OCR_TEXT: Any text visible in the image (transcribe exactly)
2. SEMANTIC_DESCRIPTION: What does this image show? Be specific about labels, arrows, diagram components.
3. ALT_TEXT: A short (1-2 sentence) description suitable for alt text / screen readers.
4. SEARCH_KEYWORDS: 5-10 keywords that would help a student find this image when searching.

Format your response as:
OCR_TEXT: <text>
SEMANTIC_DESCRIPTION: <description>
ALT_TEXT: <alt text>
SEARCH_KEYWORDS: <comma-separated keywords>
"""

    @classmethod
    def _parse_fields(cls, text: str) -> dict[str, str]:
        """Split a model reply into its labelled fields.

        A field starts at a line beginning with one of the labels (optionally
        preceded by list numbering or Markdown emphasis) and runs until the
        next label. Missing fields map to an empty string; if a label repeats,
        the first occurrence wins.
        """
        import re

        labels = "|".join(cls._FIELD_LABELS)
        pattern = rf"^[ \t*#\d.)-]*({labels})[ \t*]*:[ \t*]*"
        # With one capture group, split() yields [preamble, label, body, label, body, ...].
        parts = re.split(pattern, text or "", flags=re.MULTILINE)

        fields = {label: "" for label in cls._FIELD_LABELS}
        for label, body in zip(parts[1::2], parts[2::2]):
            if not fields[label]:
                fields[label] = body.strip()
        return fields

    @staticmethod
    def _split_keywords(raw: str) -> list[str]:
        """Split a keyword string on commas (Latin or Arabic), semicolons or newlines."""
        import re

        return [kw.strip() for kw in re.split(r"[,،;\n]", raw or "") if kw.strip()]

    def _parse_response(self, response, asset_id: str) -> ImageDescription:
        """Turn a chat completion into an ``ImageDescription``.

        Reads ``response.choices[0].message.content``. When the reply contains
        none of the expected labels, the whole reply is used as the semantic
        description so the model output is not lost.
        """
        content = response.choices[0].message.content or ""
        fields = self._parse_fields(content)
        if not any(fields.values()):
            fields["SEMANTIC_DESCRIPTION"] = content.strip()

        return ImageDescription(
            asset_id=asset_id,
            ocr_text=fields["OCR_TEXT"],
            semantic_description=fields["SEMANTIC_DESCRIPTION"],
            alt_text=fields["ALT_TEXT"],
            search_keywords=self._split_keywords(fields["SEARCH_KEYWORDS"]),
        )

TableExtractor

Converts PDF table structures to structured markdown.

class TableExtractor:
    """Convert detected PDF tables into Markdown-backed ``ExtractedTable`` records."""

    def extract_table(self, table, page_num: int) -> ExtractedTable | None:
        """Convert a PyMuPDF table to structured markdown.

        Args:
            table: Object exposing ``extract()`` (rows of cell values) and
                ``bbox``.
            page_num: Page the table was found on.

        Returns:
            The converted table, or ``None`` when the table has no rows or no
            columns.
        """
        cleaned_rows = self._clean_rows(table.extract())
        if not cleaned_rows:
            return None

        markdown = self._rows_to_markdown(cleaned_rows)

        return ExtractedTable(
            table_id=self._table_id(page_num, markdown),
            markdown=markdown,
            bbox=table.bbox,
            page_num=page_num,
            row_count=len(cleaned_rows),
            col_count=len(cleaned_rows[0]),
        )

    @staticmethod
    def _clean_rows(rows) -> list[list[str]]:
        """Normalise raw cell values into a rectangular grid of strings.

        Empty cells arrive as ``None`` and become "". Newlines inside a cell
        are replaced by spaces and pipes are escaped so they cannot break the
        Markdown row. Short rows are padded to the widest row. Returns an empty
        list when there are no rows or no columns.
        """
        if not rows:
            return []

        cleaned = [
            [
                "" if cell is None
                else str(cell).strip().replace("\n", " ").replace("|", "\\|")
                for cell in (row or [])
            ]
            for row in rows
        ]
        width = max(len(row) for row in cleaned)
        if width == 0:
            return []
        return [row + [""] * (width - len(row)) for row in cleaned]

    @staticmethod
    def _rows_to_markdown(rows: list[list[str]]) -> str:
        """Render cleaned rows as a Markdown table; the first row is the header."""
        header = rows[0]
        lines = [
            "| " + " | ".join(header) + " |",
            "| " + " | ".join(["---"] * len(header)) + " |",
        ]
        lines.extend("| " + " | ".join(row) + " |" for row in rows[1:])
        return "\n".join(lines)

    @staticmethod
    def _table_id(page_num: int, markdown: str) -> str:
        """Derive a reproducible id from the page number and table content.

        The built-in ``hash()`` of a string is salted per process, so it cannot
        be used for ids that must be stable across ingestion runs.
        """
        import hashlib

        digest = hashlib.sha256(markdown.encode("utf-8")).hexdigest()[:8]
        return f"table_{page_num}_{digest}"

Database Schema

-- One row per asset (image, table or drawing) extracted from a chapter.
CREATE TABLE curriculum_assets (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),

    -- Link to chapter
    chapter_split_id UUID REFERENCES chapter_splits(id),
    document_id UUID REFERENCES documents(id),

    -- Asset metadata
    asset_type TEXT NOT NULL,          -- "image" | "table" | "drawing"
    -- NOTE(review): the values listed below differ from the ContentType
    -- constants ("text" | "image" | "table" | "composite" | "decorative");
    -- confirm which vocabulary this column is meant to hold.
    content_type TEXT NOT NULL,         -- "decorative" | "content" | "composite"

    -- Image data
    image_bytes BYTEA,
    image_mime_type TEXT DEFAULT 'image/png',
    image_width INTEGER,
    image_height INTEGER,

    -- Text representation
    ocr_text TEXT,
    semantic_description TEXT,
    alt_text TEXT,
    search_text TEXT,

    -- Position
    page_number INTEGER NOT NULL,
    bounding_box JSONB,               -- {"x0": 100, "y0": 200, "x1": 400, "y1": 600}

    -- Linkage
    linked_chunk_ids UUID[],

    -- Quality signals
    confidence FLOAT DEFAULT 1.0,
    needs_review BOOLEAN DEFAULT FALSE,

    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- Extend chunks table
-- IF NOT EXISTS makes each statement a no-op when the column is already
-- present, so the migration can be re-run safely.
ALTER TABLE chunks ADD COLUMN IF NOT EXISTS content_type TEXT DEFAULT 'text';
ALTER TABLE chunks ADD COLUMN IF NOT EXISTS is_visual_reference BOOLEAN DEFAULT FALSE;
ALTER TABLE chunks ADD COLUMN IF NOT EXISTS linked_asset_ids UUID[];
ALTER TABLE chunks ADD COLUMN IF NOT EXISTS page_coordinates JSONB;

Integration with Ingestion Pipeline

ChapterSplitter.split()
         │
         ▼
┌─────────────────────────────────────────────────────────────┐
│                 AssetExtractor.extract()                      │
│  Returns: images, tables, text_blocks, drawings             │
└─────────────────────────────────────────────────────────────┘
         │
         ├──► ContentClassifier.is_content_image()
         │         │
         │    ┌────┴────┐
         │    │         │
         │  Content  Decorative
         │    │         │
         │    ▼         ▼
         │ ImageDescriber  → Skip or low-priority store
         │    │
         │    ▼
         │  Store in curriculum_assets
         │
         ├──► TableExtractor
         │         │
         │         ▼
         │    Convert to markdown
         │    Treat as text chunk
         │
         └──► Text blocks
                   │
                   ▼
            Clean + Chunk normally
                   │
                   ▼
            Embed + store with linked_asset_ids

Retrieval: Returning Images with Text

Update RetrievalResult to include linked assets:

@dataclass
class RetrievalResult:
    """One retrieved chunk together with the assets linked to it."""
    chunk_id: str  # Identifier of the retrieved chunk
    text: str  # Chunk text
    content_type: str  # Content-type label of the chunk
    score: float  # Retrieval score

    # Visual content
    linked_assets: list[dict] | None  # One dict per linked asset, shaped as below
    # [
    #   {
    #     "asset_id": "...",
    #     "asset_type": "image",
    #     "image_url": "...",      # Presigned URL or base64
    #     "alt_text": "...",
    #     "semantic_description": "...",
    #     "ocr_text": "...",
    #   }
    # ]

    chapter_title: str  # Title of the chapter the chunk belongs to
    lesson_title: str  # Title of the lesson the chunk belongs to
    page_range: str  # Page range of the chunk, as a string

Visual Query Boosting

When a query contains visual keywords, boost chunks with linked image assets:

# Words that signal the user is asking for visual material, per language.
VISUAL_KEYWORDS = {
    "en": ["picture", "photo", "graph", "diagram", "figure", "image", "show"],
    "fr": ["image", "photo", "graphique", "diagramme", "figure"],
    "ar": ["صورة", "رسم", "شكل"],
}


def is_visual_query(query: str) -> bool:
    """Return True when the query contains a visual keyword in any language.

    ASCII keywords are matched case-insensitively as whole words (an optional
    plural "s" is allowed), so "graph" no longer fires on "geography" or
    "paragraph". Non-ASCII (Arabic) keywords are matched as substrings because
    prefixes such as "ال" attach directly to the word. An empty or ``None``
    query is never visual.
    """
    import re

    if not query:
        return False

    query_lower = query.lower()
    for keywords in VISUAL_KEYWORDS.values():
        for kw in keywords:
            if kw.isascii():
                if re.search(rf"\b{re.escape(kw)}s?\b", query_lower):
                    return True
            elif kw in query_lower:
                return True
    return False

# In RetrievalPipeline.retrieve():
# When the query looks visual, multiply the score of every result that has
# linked assets by 1.2.
# NOTE(review): scores are changed in place and the list is not re-sorted
# here — confirm the caller re-ranks afterwards.
# NOTE(review): results are accessed as dicts here, while RetrievalResult is
# declared as a dataclass — confirm which shape retrieve() works with.
if is_visual_query(query):
    for chunk in results:
        if chunk.get("linked_asset_ids"):
            chunk["score"] *= 1.2   # Boost visual content

Implementation Location

app/
  services/
    chapter_splitter.py      # From chapter-splitting.md
    asset_extractor.py       # NEW: AssetExtractor + ContentClassifier
    image_describer.py       # NEW: ImageDescriber (uses OpenAI GPT-4o vision)
    table_extractor.py       # NEW: TableExtractor

External Dependencies

PyMuPDF (fitz) — already in use for PDF processing
OpenAI GPT-4o — for vision-based image description (existing OpenAI client can be reused)
No new infrastructure dependencies