Custom Schemas
Define extraction schemas for structured data extraction
Custom Schemas
Custom schemas allow you to define exactly what data to extract from your documents. LeapOCR uses JSON Schema format to understand your extraction requirements.
Basic Schema Structure
Schemas define the structure and types of data you want to extract.
Simple Schema
{
"invoice_number": "string",
"total_amount": "number",
"invoice_date": "string"
}import * as z from 'zod';
const InvoiceSchema = z.object({
invoice_number: z.string(),
total_amount: z.number(),
invoice_date: z.string(),
});
type Invoice = z.infer<typeof InvoiceSchema>;
// Use with LeapOCR
const result = await client.ocr.processURL(url, {
format: "structured",
schema: {
invoice_number: "string",
total_amount: "number",
invoice_date: "string",
}
});from pydantic import BaseModel
class Invoice(BaseModel):
invoice_number: str
total_amount: float
invoice_date: str
# Use with LeapOCR
result = await client.ocr.process_url(
url,
options=ProcessOptions(
format=Format.STRUCTURED,
schema={
"invoice_number": "string",
"total_amount": "number",
"invoice_date": "string",
}
)
)
# Parse result with Pydantic
invoice = Invoice(**result.pages[0].result)type Invoice struct {
InvoiceNumber string `json:"invoice_number"`
TotalAmount float64 `json:"total_amount"`
InvoiceDate string `json:"invoice_date"`
}
// Use with LeapOCR
schema := map[string]interface{}{
"invoice_number": "string",
"total_amount": "number",
"invoice_date": "string",
}
job, err := client.ProcessURL(ctx, url,
ocr.WithFormat(ocr.FormatStructured),
ocr.WithSchema(schema),
)
Full JSON Schema
{
"type": "object",
"properties": {
"invoice_number": {
"type": "string",
"description": "The unique invoice identifier"
},
"total_amount": {
"type": "number",
"description": "Total invoice amount in dollars"
},
"invoice_date": {
"type": "string",
"description": "Invoice date in ISO format"
}
},
"required": ["invoice_number", "total_amount"]
}import * as z from 'zod';
const InvoiceSchema = z.object({
invoice_number: z.string().describe('The unique invoice identifier'),
total_amount: z.number().describe('Total invoice amount in dollars'),
invoice_date: z.string().describe('Invoice date in ISO format'),
});
// Generate JSON Schema from Zod
const jsonSchema = z.toJSONSchema(InvoiceSchema);
// => {
// type: 'object',
// properties: {
// invoice_number: { type: 'string', description: '...' },
// total_amount: { type: 'number', description: '...' },
// invoice_date: { type: 'string', description: '...' }
// },
// required: ['invoice_number', 'total_amount', 'invoice_date'],
// additionalProperties: false
// }
// Use with LeapOCR
const result = await client.ocr.processURL(url, {
format: "structured",
schema: {
invoice_number: "string",
total_amount: "number",
invoice_date: "string",
}
});import json
from pydantic import BaseModel, Field
class Invoice(BaseModel):
invoice_number: str = Field(
description="The unique invoice identifier"
)
total_amount: float = Field(
description="Total invoice amount in dollars"
)
invoice_date: str = Field(
description="Invoice date in ISO format"
)
# Generate JSON Schema from Pydantic
json_schema = Invoice.model_json_schema()
print(json.dumps(json_schema, indent=2))
# => {
# "type": "object",
# "properties": {
# "invoice_number": {"type": "string", "description": "..."},
# "total_amount": {"type": "number", "description": "..."},
# "invoice_date": {"type": "string", "description": "..."}
# },
# "required": ["invoice_number", "total_amount", "invoice_date"]
# }
# Use with LeapOCR
result = await client.ocr.process_url(
url,
options=ProcessOptions(
format=Format.STRUCTURED,
schema={
"invoice_number": "string",
"total_amount": "number",
"invoice_date": "string",
}
)
)import "github.com/invopop/jsonschema"
type Invoice struct {
InvoiceNumber string `json:"invoice_number" jsonschema:"description=The unique invoice identifier"`
TotalAmount float64 `json:"total_amount" jsonschema:"description=Total invoice amount in dollars"`
InvoiceDate string `json:"invoice_date" jsonschema:"description=Invoice date in ISO format"`
}
// Generate JSON Schema from Go struct
reflectedSchema := jsonschema.Reflect(&Invoice{})
// => {
// "$schema": "https://json-schema.org/draft/2020-12/schema",
// "properties": {
// "invoice_number": {"type": "string", "description": "..."},
// "total_amount": {"type": "number", "description": "..."},
// "invoice_date": {"type": "string", "description": "..."}
// },
// "required": ["invoice_number", "total_amount", "invoice_date"],
// "type": "object"
// }
// For LeapOCR, use simplified format
schema := map[string]interface{}{
"invoice_number": "string",
"total_amount": "number",
"invoice_date": "string",
}
Nested Objects
Extract complex, nested data structures.
{
"type": "object",
"properties": {
"customer": {
"type": "object",
"properties": {
"name": { "type": "string" },
"email": { "type": "string" },
"address": {
"type": "object",
"properties": {
"street": { "type": "string" },
"city": { "type": "string" },
"zip": { "type": "string" }
}
}
}
}
}
}import * as z from 'zod';
const AddressSchema = z.object({
street: z.string(),
city: z.string(),
zip: z.string(),
});
const CustomerSchema = z.object({
name: z.string(),
email: z.string().email(),
address: AddressSchema,
});
const OrderSchema = z.object({
customer: CustomerSchema,
});
type Order = z.infer<typeof OrderSchema>;
// Use with LeapOCR
const result = await client.ocr.processURL(url, {
format: "structured",
schema: {
customer: {
name: "string",
email: "string",
address: {
street: "string",
city: "string",
zip: "string",
}
}
}
});from pydantic import BaseModel, EmailStr
class Address(BaseModel):
street: str
city: str
zip: str
class Customer(BaseModel):
name: str
email: EmailStr
address: Address
class Order(BaseModel):
customer: Customer
# Use with LeapOCR
result = await client.ocr.process_url(
url,
options=ProcessOptions(
format=Format.STRUCTURED,
schema={
"customer": {
"name": "string",
"email": "string",
"address": {
"street": "string",
"city": "string",
"zip": "string",
}
}
}
)
)
# Parse with Pydantic
order = Order(**result.pages[0].result)type Address struct {
Street string `json:"street"`
City string `json:"city"`
Zip string `json:"zip"`
}
type Customer struct {
Name string `json:"name"`
Email string `json:"email"`
Address Address `json:"address"`
}
type Order struct {
Customer Customer `json:"customer"`
}
// Create schema for LeapOCR
schema := map[string]interface{}{
"customer": map[string]interface{}{
"name": "string",
"email": "string",
"address": map[string]interface{}{
"street": "string",
"city": "string",
"zip": "string",
},
},
}
Arrays
Extract lists and repeating data.
{
"type": "object",
"properties": {
"line_items": {
"type": "array",
"items": {
"type": "object",
"properties": {
"description": { "type": "string" },
"quantity": { "type": "number" },
"price": { "type": "number" },
"total": { "type": "number" }
}
}
}
}
}import * as z from 'zod';
const LineItemSchema = z.object({
description: z.string(),
quantity: z.number(),
price: z.number(),
total: z.number(),
});
const InvoiceSchema = z.object({
line_items: z.array(LineItemSchema),
});
type Invoice = z.infer<typeof InvoiceSchema>;
// Use with LeapOCR
const result = await client.ocr.processURL(url, {
format: "structured",
schema: {
line_items: [{
description: "string",
quantity: "number",
price: "number",
total: "number",
}]
}
});from pydantic import BaseModel
class LineItem(BaseModel):
description: str
quantity: float
price: float
total: float
class Invoice(BaseModel):
line_items: list[LineItem]
# Use with LeapOCR
result = await client.ocr.process_url(
url,
options=ProcessOptions(
format=Format.STRUCTURED,
schema={
"line_items": [{
"description": "string",
"quantity": "number",
"price": "number",
"total": "number",
}]
}
)
)
# Parse with Pydantic
invoice = Invoice(**result.pages[0].result)type LineItem struct {
Description string `json:"description"`
Quantity float64 `json:"quantity"`
Price float64 `json:"price"`
Total float64 `json:"total"`
}
type Invoice struct {
LineItems []LineItem `json:"line_items"`
}
// Create schema for LeapOCR
schema := map[string]interface{}{
"line_items": []map[string]interface{}{{
"description": "string",
"quantity": "number",
"price": "number",
"total": "number",
}},
}
Supported Types
| Type | Description | Example |
|---|---|---|
| string | Text data | "John Doe" |
| number | Numeric values | 1234.56 |
| integer | Whole numbers | 42 |
| boolean | True/false values | true |
| array | Lists of items | ["item1", "item2"] |
| object | Nested structures | {"key": "value"} |
| null | Empty/missing values | null |
Best Practices
1. Be Specific with Descriptions
{
"properties": {
"total_amount": {
"type": "number",
"description": "Total invoice amount in USD, excluding tax"
}
}
}
2. Use Required Fields
{
"required": ["invoice_number", "date", "total"],
"properties": {
"invoice_number": { "type": "string" },
"date": { "type": "string" },
"total": { "type": "number" }
}
}
3. Provide Examples in Descriptions
{
"properties": {
"date": {
"type": "string",
"description": "Invoice date in YYYY-MM-DD format, e.g., 2024-01-15"
}
}
}
4. Keep Schemas Focused
Don't try to extract everything at once. Focus on the most important fields for your use case.
Real-World Examples
Medical Record
{
"type": "object",
"properties": {
"patient_name": { "type": "string" },
"date_of_birth": { "type": "string" },
"visit_date": { "type": "string" },
"diagnosis": { "type": "string" },
"medications": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": { "type": "string" },
"dosage": { "type": "string" },
"frequency": { "type": "string" }
}
}
}
}
}import * as z from 'zod';
const MedicationSchema = z.object({
name: z.string(),
dosage: z.string(),
frequency: z.string(),
});
const MedicalRecordSchema = z.object({
patient_name: z.string(),
date_of_birth: z.string(),
visit_date: z.string(),
diagnosis: z.string(),
medications: z.array(MedicationSchema),
});
type MedicalRecord = z.infer<typeof MedicalRecordSchema>;from pydantic import BaseModel
from datetime import date
class Medication(BaseModel):
name: str
dosage: str
frequency: str
class MedicalRecord(BaseModel):
patient_name: str
date_of_birth: str
visit_date: str
diagnosis: str
medications: list[Medication]type Medication struct {
Name string `json:"name"`
Dosage string `json:"dosage"`
Frequency string `json:"frequency"`
}
type MedicalRecord struct {
PatientName string `json:"patient_name"`
DateOfBirth string `json:"date_of_birth"`
VisitDate string `json:"visit_date"`
Diagnosis string `json:"diagnosis"`
Medications []Medication `json:"medications"`
}
Receipt
{
"type": "object",
"properties": {
"merchant_name": { "type": "string" },
"date": { "type": "string" },
"total": { "type": "number" },
"items": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": { "type": "string" },
"price": { "type": "number" }
}
}
}
}
}
Contract
{
"type": "object",
"properties": {
"contract_number": { "type": "string" },
"effective_date": { "type": "string" },
"expiration_date": { "type": "string" },
"parties": {
"type": "array",
"items": {
"type": "object",
"properties": {
"name": { "type": "string" },
"role": { "type": "string" }
}
}
},
"terms": { "type": "string" }
}
}
Schema vs Instructions vs Templates
You can specify extraction requirements in three ways:
- Schema: Structured data extraction with defined types
- Instructions: Natural language description (e.g., "Extract the invoice total and date")
- Template: Pre-defined document type with standard fields
Note: Only one of these three options can be used per request. Choose based on your needs:
- Use schema for complex, structured extraction
- Use instructions for simple, ad-hoc extraction