Asset Extraction Architecture
Goal: Extract visual content (images, tables, diagrams) from curriculum PDFs, preserve images for display, and generate searchable text descriptions for retrieval.
Overview
Chapter PDF (from ChapterSplitter)
│
▼
┌─────────────────────────────────────────────────────────────┐
│ AssetExtractor │
│ │
│ 1. Extract images with bounding boxes │
│ 2. Extract tables with structure │
│ 3. Extract text blocks with positions │
│ 4. Extract vector drawings (graphs, diagrams) │
└─────────────────────────────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────┐
│ ContentClassifier │
│ │
│ Is this image content-relevant or decorative? │
│ Signals: size, OCR text, caption proximity, page type │
└─────────────────────────────────────────────────────────────┘
│
▼
┌─────────────────────────────────────────────────────────────┐
│ ImageDescriber │
│ │
│ For content-relevant images only: │
│ - OCR: extract visible text │
│ - Vision model: semantic description + alt text │
│ - Search keywords: boost text searchability │
└─────────────────────────────────────────────────────────────┘
│
▼
│
┌────┴────┐
│ │
Text Chunks CurriculumAssets
(embedded) (stored)
Content Type Taxonomy
Every chunk and asset carries a content_type:
class ContentType:
    """String tags carried by every chunk and asset as its ``content_type``."""

    # Pure prose text
    TEXT = "text"

    # Visual content - must be preserved
    IMAGE = "image"

    # Structured tabular data
    TABLE = "table"

    # Mix of text + image on same page
    COMPOSITE = "composite"

    # Borders, logos, page numbers - skip
    DECORATIVE = "decorative"
AssetExtractor
class ExtractedImage(NamedTuple):
image_id: str
bytes: bytes
width: int
height: int
bbox: tuple[float, float, float, float] # x0, y0, x1, y1
page_num: int
ocr_text: str | None
extraction_method: str # "native" | "vector"
class ExtractedTable(NamedTuple):
    """A table found on a PDF page, rendered as markdown."""

    table_id: str
    # Structured text representation of the table
    markdown: str
    # Position on the page (same layout as the other ``bbox`` fields in this file)
    bbox: tuple[float, float, float, float]
    page_num: int
    # Table dimensions
    row_count: int
    col_count: int
class ExtractedTextBlock(NamedTuple):
    """A block of page text together with its position and heuristic flags."""

    text: str
    # Position of the block on the page
    bbox: tuple[float, float, float, float]
    page_num: int
    # Heuristic flags set by the extractor
    is_heading: bool
    is_caption: bool
class AssetExtractionResult(NamedTuple):
    """Everything one extraction pass collected from a PDF."""

    images: list[ExtractedImage]
    tables: list[ExtractedTable]
    text_blocks: list[ExtractedTextBlock]
    # Vector graphics
    drawings: list
    # Number of pages in the source document
    total_pages: int
class AssetExtractor:
    """Pull images, tables, text blocks and vector drawings out of a chapter PDF.

    Page numbers on every returned record are 1-based.
    """

    # Words that mark a short text block as a figure caption.
    _CAPTION_MARKERS = ("شكل", "صورة", "Figure")

    def __init__(self):
        # Publisher boilerplate per language. Kept for callers that read it;
        # none of the methods in this class consult it.
        self.noise_phrases = {
            "ar": ["المعهد التربوي الوطني", "IPN"],
            "fr": ["IPN"],
            "en": ["IPN"],
        }

    def extract(self, pdf_bytes: bytes) -> AssetExtractionResult:
        """Walk every page of the PDF and collect its assets.

        Args:
            pdf_bytes: Raw bytes of a PDF document.

        Returns:
            An ``AssetExtractionResult`` with the images, tables, text blocks
            and drawings of all pages, plus the page count.
        """
        images = []
        tables = []
        text_blocks = []
        drawings = []

        # PyMuPDF opens in-memory documents via ``stream=``; the context
        # manager guarantees the document is closed even if a page fails.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            total_pages = len(doc)
            for page_num, page in enumerate(doc, start=1):
                blocks = page.get_text("dict")["blocks"]
                for block in blocks:
                    if block["type"] == 0:  # text block
                        text = self._block_text(block)
                        if not text:
                            continue
                        text_blocks.append(ExtractedTextBlock(
                            text=text,
                            bbox=block["bbox"],
                            page_num=page_num,
                            is_heading=self._is_heading(text),
                            is_caption=self._is_caption(text, blocks),
                        ))
                    elif block["type"] == 1:  # image block
                        img = self._extract_image(block, page_num)
                        if img:
                            images.append(img)

                # Vector drawings (graphs, diagrams)
                paths = page.get_drawings()
                if paths:
                    drawings.extend(self._extract_drawings(paths, page_num))

                # Tables
                for table in page.find_tables().tables:
                    tbl = self._extract_table(table, page_num)
                    if tbl:
                        tables.append(tbl)

        return AssetExtractionResult(
            images=images,
            tables=tables,
            text_blocks=text_blocks,
            drawings=drawings,
            total_pages=total_pages,
        )

    @staticmethod
    def _block_text(block: dict) -> str:
        """Join the span texts of a "dict" text block into one string.

        Text blocks carry their text under ``lines[*].spans[*].text`` rather
        than a top-level ``"text"`` key. Spans of one line are concatenated;
        lines are separated by a newline.
        """
        lines = (
            "".join(span.get("text", "") for span in line.get("spans", []))
            for line in block.get("lines", [])
        )
        return "\n".join(lines).strip()

    def _extract_image(self, block: dict, page_num: int) -> ExtractedImage | None:
        """Build an ``ExtractedImage`` from an image block.

        Image blocks produced by ``get_text("dict")`` embed the encoded image
        under ``"image"`` along with ``"width"`` and ``"height"``; they carry
        no xref, so no second lookup on the document is needed.

        Returns:
            The image record, or None when the block holds no image bytes.
        """
        data = block.get("image")
        if not data:
            return None
        return ExtractedImage(
            # Page number + block number identifies the block within the document.
            image_id=f"p{page_num}_b{block.get('number', 0)}",
            bytes=data,
            width=block.get("width", 0),
            height=block.get("height", 0),
            bbox=block["bbox"],
            page_num=page_num,
            ocr_text=None,  # Done separately by OCR pass
            extraction_method="native",
        )

    def _extract_drawings(self, paths: list, page_num: int) -> list:
        """Tag each path dict returned by ``page.get_drawings()`` with its page number."""
        return [{**path, "page_num": page_num} for path in paths]

    def _extract_table(self, table, page_num: int) -> ExtractedTable | None:
        """Convert one detected table by delegating to ``TableExtractor.extract_table``."""
        return TableExtractor().extract_table(table, page_num)

    def _is_heading(self, text: str) -> bool:
        """Heuristic: at most 100 characters and either all caps or an Arabic lesson title.

        Font size is not inspected; it could be added from the block dict.
        """
        if len(text) > 100:
            return False
        return text.isupper() or bool(re.match(r'^الدرس\s*\d+', text))

    def _is_caption(self, text: str, all_blocks: list) -> bool:
        """Heuristic: short text (< 200 chars) containing a caption marker word.

        ``all_blocks`` is accepted for interface compatibility but is not
        used; no proximity to an image block is checked. The bare letter "ص"
        is deliberately not a marker: it occurs in ordinary Arabic words and
        would flag almost every short Arabic block as a caption.
        """
        if len(text) >= 200:
            return False
        return any(marker in text for marker in self._CAPTION_MARKERS)
ContentClassifier
Determines whether an image is content-relevant or decorative.
class ContentClassifier:
    """Score an extracted image as content-relevant or decorative."""

    DECORATIVE_SIGNALS = [
        "ipn", "logo", "copyright",
    ]
    MIN_CONTENT_AREA = 50_000  # px² - above this an image counts as large
    MEDIUM_CONTENT_AREA = 10_000  # px² - at or below this an image is likely an icon

    # Caption cue words, compared against case-folded text. "legend" /
    # "légende" replace the unintelligible "leigh" entry.
    # NOTE(review): confirm the intended French cue word.
    _CAPTION_KEYWORDS = ("شكل", "صورة", "figure", "fig", "انظر", "legend", "légende")

    def is_content_image(self, img: ExtractedImage, page_context: str) -> tuple[bool, float]:
        """
        Returns (is_content, confidence_score).

        The raw score is the sum of the signals below; the image is content
        when the raw score exceeds 0.3, and the returned confidence is the raw
        score clamped to [0.0, 1.0].

        Scoring:
        - Has substantial OCR text (>10 chars): +0.8
        - Large image area (>50k px²): +0.6
        - Medium image area (10k-50k px²): +0.3
        - Small image area (<=10k px²): -0.2
        - Caption keyword present in page_context: +0.7
        - Each decorative pattern found in the OCR text: -0.9
        """
        score = 0.0
        ocr_text = img.ocr_text or ""

        # Signal 1: OCR text content
        if len(ocr_text.strip()) > 10:
            score += 0.8

        # Signal 2: Image size
        area = img.width * img.height
        if area > self.MIN_CONTENT_AREA:
            score += 0.6
        elif area > self.MEDIUM_CONTENT_AREA:
            score += 0.3
        else:
            score -= 0.2

        # Signal 3: Caption nearby
        if page_context and self._caption_near_bbox(img.bbox, page_context):
            score += 0.7

        # Signal 4: Decorative pattern match (penalty accumulates per pattern)
        ocr_lower = ocr_text.lower()
        score -= 0.9 * sum(1 for pattern in self.DECORATIVE_SIGNALS if pattern in ocr_lower)

        # Signal 5: Is on illustration-heavy page
        # (pages with many images, few paragraphs) - not implemented

        is_content = score > 0.3
        return is_content, max(0.0, min(1.0, score))

    def _caption_near_bbox(self, bbox: tuple, context: str) -> bool:
        """Report whether ``context`` contains a caption cue word.

        Matching is a case-insensitive substring test, so "Fig. 2" and
        "FIGURE 2" both count. ``bbox`` is accepted for interface
        compatibility only: ``context`` is plain text without positions, so
        no geometric proximity is checked.
        """
        folded = context.casefold()
        return any(keyword in folded for keyword in self._CAPTION_KEYWORDS)
ImageDescriber
Generates semantic descriptions for content-relevant images using a vision model.
class ImageDescription(NamedTuple):
    """Text produced for one image so it can be displayed and searched."""

    asset_id: str
    # Raw OCR from image
    ocr_text: str
    # What the image shows
    semantic_description: str
    # For frontend display
    alt_text: str
    # Boost searchability
    search_keywords: list[str]
class ImageDescriber:
    """Describe content-relevant images with a vision-capable chat model."""

    # Section labels the prompt asks the model to emit, in order.
    _SECTION_LABELS = ("OCR_TEXT", "SEMANTIC_DESCRIPTION", "ALT_TEXT", "SEARCH_KEYWORDS")

    def __init__(self, openai_client: OpenAI, model: str = "gpt-4o"):
        """Store the client and the model name used for every request."""
        self.client = openai_client
        self.model = model

    def describe(
        self,
        img: ExtractedImage,
        chapter_context: str,  # Surrounding text from the page/chapter
        language: str,
    ) -> ImageDescription:
        """
        Send the image to the vision model and return its parsed answer:
        1. OCR of any text in the image
        2. Semantic description
        3. Alt text for frontend
        4. Search keywords
        """
        img_b64 = base64.b64encode(img.bytes).decode()
        # Images embedded in PDFs are often JPEG; label the data URL with the
        # format the bytes actually have instead of always claiming PNG.
        mime_type = self._sniff_mime(img.bytes)
        prompt = self._build_prompt(chapter_context, language)
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{img_b64}",
                            },
                        },
                    ],
                }
            ],
            max_tokens=500,
        )
        return self._parse_response(response, img.image_id)

    @staticmethod
    def _sniff_mime(data: bytes) -> str:
        """Guess the image MIME type from its leading magic bytes.

        Falls back to ``image/png`` when the signature is not recognised.
        """
        if data.startswith(b"\xff\xd8\xff"):
            return "image/jpeg"
        if data.startswith(b"GIF8"):
            return "image/gif"
        if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
            return "image/webp"
        return "image/png"

    def _build_prompt(self, context: str, language: str) -> str:
        """Compose the instruction text; unknown languages fall back to English.

        Only the first 500 characters of ``context`` are included; a missing
        context is treated as empty.
        """
        lang_instruction = {
            "ar": "Respond in Arabic where possible.",
            "fr": "Respond in French.",
            "en": "Respond in English.",
        }.get(language, "Respond in English.")
        context = context or ""
        return f"""
This image appears in an educational textbook.
Context from the surrounding page/chapter:
\"\"\"{context[:500]}\"\"\"
{lang_instruction}
Please analyze this image and provide:
1. OCR_TEXT: Any text visible in the image (transcribe exactly)
2. SEMANTIC_DESCRIPTION: What does this image show? Be specific about labels, arrows, diagram components.
3. ALT_TEXT: A short (1-2 sentence) description suitable for alt text / screen readers.
4. SEARCH_KEYWORDS: 5-10 keywords that would help a student find this image when searching.
Format your response as:
OCR_TEXT: <text>
SEMANTIC_DESCRIPTION: <description>
ALT_TEXT: <alt text>
SEARCH_KEYWORDS: <comma-separated keywords>
"""

    def _parse_response(self, response, asset_id: str) -> ImageDescription:
        """Turn the model's labelled reply into an ``ImageDescription``.

        Sections the model omitted come back as empty strings / an empty
        keyword list.
        """
        content = response.choices[0].message.content or ""
        sections = self._split_sections(content)
        return ImageDescription(
            asset_id=asset_id,
            ocr_text=sections["OCR_TEXT"],
            semantic_description=sections["SEMANTIC_DESCRIPTION"],
            alt_text=sections["ALT_TEXT"],
            search_keywords=self._split_keywords(sections["SEARCH_KEYWORDS"]),
        )

    @classmethod
    def _split_sections(cls, content: str) -> dict[str, str]:
        """Split a ``LABEL: value`` reply into a dict keyed by section label.

        A line that starts with a known label (optionally preceded by list
        numbering or markdown emphasis) opens that section; following
        non-blank lines are appended to it until the next label.
        """
        collected = {label: [] for label in cls._SECTION_LABELS}
        current = None
        for raw_line in content.splitlines():
            line = raw_line.strip()
            candidate = line.lstrip("*#0123456789.) ")
            label = next(
                (name for name in cls._SECTION_LABELS if candidate.startswith(name + ":")),
                None,
            )
            if label is not None:
                current = label
                first = candidate[len(label) + 1:].strip().lstrip("*").strip()
                if first:
                    collected[label].append(first)
            elif current is not None and line:
                collected[current].append(line)
        return {label: "\n".join(parts) for label, parts in collected.items()}

    @staticmethod
    def _split_keywords(raw: str) -> list[str]:
        """Split a keyword line on Latin or Arabic commas, dropping blanks."""
        pieces = raw.replace("،", ",").replace("\n", ",").split(",")
        return [piece.strip() for piece in pieces if piece.strip()]
TableExtractor
Converts PDF table structures to structured markdown.
class TableExtractor:
    """Render PyMuPDF tables as markdown ``ExtractedTable`` records."""

    def extract_table(self, table, page_num: int) -> ExtractedTable | None:
        """Convert a PyMuPDF table to structured markdown.

        Args:
            table: Object exposing ``extract()`` (rows of cell values) and ``bbox``.
            page_num: Page number recorded on the result and used in its id.

        Returns:
            The table record, or None when the table has no rows or no columns.
        """
        # Local import keeps this block self-contained.
        import zlib

        rows = table.extract()
        if not rows:
            return None
        col_count = max(len(row) for row in rows)
        if col_count == 0:
            return None

        markdown = self._rows_to_markdown(rows)
        # crc32 is stable across runs; the built-in hash() of a str is salted
        # per process, which made table ids change between ingestions.
        digest = zlib.crc32(markdown.encode("utf-8")) % 10000
        return ExtractedTable(
            table_id=f"table_{page_num}_{digest}",
            markdown=markdown,
            bbox=table.bbox,
            page_num=page_num,
            row_count=len(rows),
            col_count=col_count,
        )

    @staticmethod
    def _clean_cell(cell) -> str:
        """Normalise one cell: None becomes "", newlines become spaces, pipes are escaped."""
        if cell is None:
            return ""
        return str(cell).strip().replace("\n", " ").replace("|", "\\|")

    @classmethod
    def _rows_to_markdown(cls, rows: list) -> str:
        """Build a markdown table whose first row is the header.

        Short rows are padded with empty cells so every line has the same
        number of columns.
        """
        width = max(len(row) for row in rows)
        cleaned_rows = [
            [cls._clean_cell(cell) for cell in row] + [""] * (width - len(row))
            for row in rows
        ]
        header, *body = cleaned_rows
        md_rows = [
            "| " + " | ".join(header) + " |",
            "| " + " | ".join(["---"] * width) + " |",
        ]
        md_rows.extend("| " + " | ".join(row) + " |" for row in body)
        return "\n".join(md_rows)
Database Schema
-- One row per asset extracted from a chapter PDF; asset_type names the kind.
CREATE TABLE curriculum_assets (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-- Link to chapter
chapter_split_id UUID REFERENCES chapter_splits(id),
document_id UUID REFERENCES documents(id),
-- Asset metadata
asset_type TEXT NOT NULL, -- "image" | "table" | "drawing"
-- NOTE(review): the values listed here differ from the ContentType constants
-- ("text", "image", "table", "composite", "decorative") - confirm which set applies.
content_type TEXT NOT NULL, -- "decorative" | "content" | "composite"
-- Image data (nullable: not every asset_type has an image payload)
image_bytes BYTEA,
image_mime_type TEXT DEFAULT 'image/png',
image_width INTEGER,
image_height INTEGER,
-- Text representation
ocr_text TEXT,
semantic_description TEXT,
alt_text TEXT,
search_text TEXT,
-- Position
page_number INTEGER NOT NULL,
bounding_box JSONB, -- {"x0": 100, "y0": 200, "x1": 400, "y1": 600}
-- Linkage (array of chunk ids; no REFERENCES clause is declared on it)
linked_chunk_ids UUID[],
-- Quality signals
confidence FLOAT DEFAULT 1.0,
needs_review BOOLEAN DEFAULT FALSE,
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- Extend chunks table
-- Adds four columns; content_type defaults to 'text' and is_visual_reference to FALSE.
ALTER TABLE chunks ADD COLUMN content_type TEXT DEFAULT 'text';
ALTER TABLE chunks ADD COLUMN is_visual_reference BOOLEAN DEFAULT FALSE;
-- Array of asset ids linked to the chunk (presumably curriculum_assets.id - confirm).
ALTER TABLE chunks ADD COLUMN linked_asset_ids UUID[];
ALTER TABLE chunks ADD COLUMN page_coordinates JSONB;
Integration with Ingestion Pipeline
ChapterSplitter.split()
│
▼
┌─────────────────────────────────────────────────────────────┐
│ AssetExtractor.extract() │
│ Returns: images, tables, text_blocks, drawings │
└─────────────────────────────────────────────────────────────┘
│
├──► ContentClassifier.is_content_image()
│ │
│ ┌────┴────┐
│ │ │
│ Content Decorative
│ │ │
│ ▼ ▼
│ ImageDescriber → Skip or low-priority store
│ │
│ ▼
│ Store in curriculum_assets
│
├──► TableExtractor
│ │
│ ▼
│ Convert to markdown
│ Treat as text chunk
│
└──► Text blocks
│
▼
Clean + Chunk normally
│
▼
Embed + store with linked_asset_ids
Retrieval: Returning Images with Text
Update RetrievalResult to include linked assets:
@dataclass
class RetrievalResult:
chunk_id: str
text: str
content_type: str
score: float
# Visual content
linked_assets: list[dict] | None
# [
# {
# "asset_id": "...",
# "asset_type": "image",
# "image_url": "...", # Presigned URL or base64
# "alt_text": "...",
# "semantic_description": "...",
# "ocr_text": "...",
# }
# ]
chapter_title: str
lesson_title: str
page_range: str
Visual Query Boosting
When a query contains visual keywords, boost chunks with linked image assets:
# Keywords signalling that a query asks about visual material.
# "photograph" / "photographie" are listed explicitly because keywords are
# matched as whole words (see is_visual_query), so "photo" alone no longer
# covers them.
VISUAL_KEYWORDS = {
    "en": ["picture", "photo", "photograph", "graph", "diagram", "figure", "image", "show"],
    "fr": ["image", "photo", "photographie", "graphique", "diagramme", "figure"],
    "ar": ["صورة", "رسم", "شكل", "graphique"],
}


def _is_arabic_keyword(keyword: str) -> bool:
    """Return True when the keyword contains a character from the Arabic block."""
    return any("\u0600" <= ch <= "\u06ff" for ch in keyword)


# Derived once at import time from VISUAL_KEYWORDS.
_ALL_VISUAL_KEYWORDS = {kw for kws in VISUAL_KEYWORDS.values() for kw in kws}
# Arabic attaches prefixes such as "ال" directly to the word, so those
# keywords must keep substring matching.
_ARABIC_VISUAL_KEYWORDS = tuple(sorted(kw for kw in _ALL_VISUAL_KEYWORDS if _is_arabic_keyword(kw)))
_WORD_VISUAL_KEYWORDS = sorted(kw for kw in _ALL_VISUAL_KEYWORDS if not _is_arabic_keyword(kw))
# Whole-word match with a few common inflections, so "pictures" and "showing"
# hit while "configure" and "photosynthesis" do not.
_WORD_VISUAL_RE = (
    re.compile(
        r"\b(?:" + "|".join(re.escape(kw) for kw in _WORD_VISUAL_KEYWORDS) + r")(?:s|es|ed|ing)?\b"
    )
    if _WORD_VISUAL_KEYWORDS
    else None
)


def is_visual_query(query: str) -> bool:
    """Report whether the query mentions a visual keyword in any supported language.

    Non-Arabic keywords must appear as whole words (optionally with an
    ``s``/``es``/``ed``/``ing`` ending); Arabic keywords match as substrings.
    Matching is case-insensitive.
    """
    query_lower = query.casefold()
    if _WORD_VISUAL_RE is not None and _WORD_VISUAL_RE.search(query_lower):
        return True
    return any(kw in query_lower for kw in _ARABIC_VISUAL_KEYWORDS)
# In RetrievalPipeline.retrieve():
if is_visual_query(query):
    # Boost visual content: chunks with linked assets get a 20% higher score.
    for chunk in results:
        if not chunk.get("linked_asset_ids"):
            continue
        chunk["score"] *= 1.2
Implementation Location
app/
services/
chapter_splitter.py # From chapter-splitting.md
asset_extractor.py # NEW: AssetExtractor + ContentClassifier
image_describer.py # NEW: ImageDescriber (uses OpenAI GPT-4o vision)
table_extractor.py # NEW: TableExtractor
External Dependencies
- PyMuPDF (`fitz`) — already in use for PDF processing
- OpenAI GPT-4o — for vision-based image description (existing OpenAI client can be reused)
- No new infrastructure dependencies