LeapOCR Docs

Custom Schemas

Define extraction schemas for structured data extraction

Custom Schemas

Custom schemas allow you to define exactly what data to extract from your documents. LeapOCR uses JSON Schema format to understand your extraction requirements.

Basic Schema Structure

Schemas define the structure and types of data you want to extract.

Simple Schema

{
  "invoice_number": "string",
  "total_amount": "number",
  "invoice_date": "string"
}
import * as z from 'zod';

const InvoiceSchema = z.object({
  invoice_number: z.string(),
  total_amount: z.number(),
  invoice_date: z.string(),
});

type Invoice = z.infer<typeof InvoiceSchema>;

// Use with LeapOCR
const result = await client.ocr.processURL(url, {
  format: "structured",
  schema: {
    invoice_number: "string",
    total_amount: "number",
    invoice_date: "string",
  }
});
from pydantic import BaseModel

class Invoice(BaseModel):
    invoice_number: str
    total_amount: float
    invoice_date: str

# Use with LeapOCR
result = await client.ocr.process_url(
    url,
    options=ProcessOptions(
        format=Format.STRUCTURED,
        schema={
            "invoice_number": "string",
            "total_amount": "number",
            "invoice_date": "string",
        }
    )
)
# Parse result with Pydantic
invoice = Invoice(**result.pages[0].result)
type Invoice struct {
    InvoiceNumber string  `json:"invoice_number"`
    TotalAmount   float64 `json:"total_amount"`
    InvoiceDate   string  `json:"invoice_date"`
}

// Use with LeapOCR
schema := map[string]interface{}{
    "invoice_number": "string",
    "total_amount":   "number",
    "invoice_date":   "string",
}

job, err := client.ProcessURL(ctx, url,
    ocr.WithFormat(ocr.FormatStructured),
    ocr.WithSchema(schema),
)

Full JSON Schema

{
  "type": "object",
  "properties": {
    "invoice_number": {
      "type": "string",
      "description": "The unique invoice identifier"
    },
    "total_amount": {
      "type": "number",
      "description": "Total invoice amount in dollars"
    },
    "invoice_date": {
      "type": "string",
      "description": "Invoice date in ISO format"
    }
  },
  "required": ["invoice_number", "total_amount"]
}
import * as z from 'zod';

const InvoiceSchema = z.object({
  invoice_number: z.string().describe('The unique invoice identifier'),
  total_amount: z.number().describe('Total invoice amount in dollars'),
  invoice_date: z.string().describe('Invoice date in ISO format'),
});

// Generate JSON Schema from Zod
const jsonSchema = z.toJSONSchema(InvoiceSchema);
// => {
//   type: 'object',
//   properties: {
//     invoice_number: { type: 'string', description: '...' },
//     total_amount: { type: 'number', description: '...' },
//     invoice_date: { type: 'string', description: '...' }
//   },
//   required: ['invoice_number', 'total_amount', 'invoice_date'],
//   additionalProperties: false
// }

// Use with LeapOCR
const result = await client.ocr.processURL(url, {
  format: "structured",
  schema: {
    invoice_number: "string",
    total_amount: "number",
    invoice_date: "string",
  }
});
import json
from pydantic import BaseModel, Field

class Invoice(BaseModel):
    invoice_number: str = Field(
        description="The unique invoice identifier"
    )
    total_amount: float = Field(
        description="Total invoice amount in dollars"
    )
    invoice_date: str = Field(
        description="Invoice date in ISO format"
    )

# Generate JSON Schema from Pydantic
json_schema = Invoice.model_json_schema()
print(json.dumps(json_schema, indent=2))
# => {
#   "type": "object",
#   "properties": {
#     "invoice_number": {"type": "string", "description": "..."},
#     "total_amount": {"type": "number", "description": "..."},
#     "invoice_date": {"type": "string", "description": "..."}
#   },
#   "required": ["invoice_number", "total_amount", "invoice_date"]
# }

# Use with LeapOCR
result = await client.ocr.process_url(
    url,
    options=ProcessOptions(
        format=Format.STRUCTURED,
        schema={
            "invoice_number": "string",
            "total_amount": "number",
            "invoice_date": "string",
        }
    )
)
import "github.com/invopop/jsonschema"

type Invoice struct {
    InvoiceNumber string  `json:"invoice_number" jsonschema:"description=The unique invoice identifier"`
    TotalAmount   float64 `json:"total_amount" jsonschema:"description=Total invoice amount in dollars"`
    InvoiceDate   string  `json:"invoice_date" jsonschema:"description=Invoice date in ISO format"`
}

// Generate JSON Schema from Go struct
reflectedSchema := jsonschema.Reflect(&Invoice{})
// => {
//   "$schema": "https://json-schema.org/draft/2020-12/schema",
//   "properties": {
//     "invoice_number": {"type": "string", "description": "..."},
//     "total_amount": {"type": "number", "description": "..."},
//     "invoice_date": {"type": "string", "description": "..."}
//   },
//   "required": ["invoice_number", "total_amount", "invoice_date"],
//   "type": "object"
// }

// For LeapOCR, use simplified format
schema := map[string]interface{}{
    "invoice_number": "string",
    "total_amount":   "number",
    "invoice_date":   "string",
}

Nested Objects

Extract complex, nested data structures.

{
  "type": "object",
  "properties": {
    "customer": {
      "type": "object",
      "properties": {
        "name": { "type": "string" },
        "email": { "type": "string" },
        "address": {
          "type": "object",
          "properties": {
            "street": { "type": "string" },
            "city": { "type": "string" },
            "zip": { "type": "string" }
          }
        }
      }
    }
  }
}
import * as z from 'zod';

const AddressSchema = z.object({
  street: z.string(),
  city: z.string(),
  zip: z.string(),
});

const CustomerSchema = z.object({
  name: z.string(),
  email: z.string().email(),
  address: AddressSchema,
});

const OrderSchema = z.object({
  customer: CustomerSchema,
});

type Order = z.infer<typeof OrderSchema>;

// Use with LeapOCR
const result = await client.ocr.processURL(url, {
  format: "structured",
  schema: {
    customer: {
      name: "string",
      email: "string",
      address: {
        street: "string",
        city: "string",
        zip: "string",
      }
    }
  }
});
from pydantic import BaseModel, EmailStr

class Address(BaseModel):
    street: str
    city: str
    zip: str

class Customer(BaseModel):
    name: str
    email: EmailStr
    address: Address

class Order(BaseModel):
    customer: Customer

# Use with LeapOCR
result = await client.ocr.process_url(
    url,
    options=ProcessOptions(
        format=Format.STRUCTURED,
        schema={
            "customer": {
                "name": "string",
                "email": "string",
                "address": {
                    "street": "string",
                    "city": "string",
                    "zip": "string",
                }
            }
        }
    )
)
# Parse with Pydantic
order = Order(**result.pages[0].result)
type Address struct {
    Street string `json:"street"`
    City   string `json:"city"`
    Zip    string `json:"zip"`
}

type Customer struct {
    Name    string  `json:"name"`
    Email   string  `json:"email"`
    Address Address `json:"address"`
}

type Order struct {
    Customer Customer `json:"customer"`
}

// Create schema for LeapOCR
schema := map[string]interface{}{
    "customer": map[string]interface{}{
        "name":  "string",
        "email": "string",
        "address": map[string]interface{}{
            "street": "string",
            "city":   "string",
            "zip":    "string",
        },
    },
}

Arrays

Extract lists and repeating data.

{
  "type": "object",
  "properties": {
    "line_items": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "description": { "type": "string" },
          "quantity": { "type": "number" },
          "price": { "type": "number" },
          "total": { "type": "number" }
        }
      }
    }
  }
}
import * as z from 'zod';

const LineItemSchema = z.object({
  description: z.string(),
  quantity: z.number(),
  price: z.number(),
  total: z.number(),
});

const InvoiceSchema = z.object({
  line_items: z.array(LineItemSchema),
});

type Invoice = z.infer<typeof InvoiceSchema>;

// Use with LeapOCR
const result = await client.ocr.processURL(url, {
  format: "structured",
  schema: {
    line_items: [{
      description: "string",
      quantity: "number",
      price: "number",
      total: "number",
    }]
  }
});
from pydantic import BaseModel

class LineItem(BaseModel):
    description: str
    quantity: float
    price: float
    total: float

class Invoice(BaseModel):
    line_items: list[LineItem]

# Use with LeapOCR
result = await client.ocr.process_url(
    url,
    options=ProcessOptions(
        format=Format.STRUCTURED,
        schema={
            "line_items": [{
                "description": "string",
                "quantity": "number",
                "price": "number",
                "total": "number",
            }]
        }
    )
)
# Parse with Pydantic
invoice = Invoice(**result.pages[0].result)
type LineItem struct {
    Description string  `json:"description"`
    Quantity    float64 `json:"quantity"`
    Price       float64 `json:"price"`
    Total       float64 `json:"total"`
}

type Invoice struct {
    LineItems []LineItem `json:"line_items"`
}

// Create schema for LeapOCR
schema := map[string]interface{}{
    "line_items": []map[string]interface{}{{
        "description": "string",
        "quantity":    "number",
        "price":       "number",
        "total":       "number",
    }},
}

Supported Types

| Type | Description | Example |
| --- | --- | --- |
| string | Text data | "John Doe" |
| number | Numeric values | 1234.56 |
| integer | Whole numbers | 42 |
| boolean | True/false values | true |
| array | Lists of items | ["item1", "item2"] |
| object | Nested structures | {"key": "value"} |
| null | Empty/missing values | null |

Best Practices

1. Be Specific with Descriptions

{
  "properties": {
    "total_amount": {
      "type": "number",
      "description": "Total invoice amount in USD, excluding tax"
    }
  }
}

2. Use Required Fields

{
  "required": ["invoice_number", "date", "total"],
  "properties": {
    "invoice_number": { "type": "string" },
    "date": { "type": "string" },
    "total": { "type": "number" }
  }
}

3. Provide Examples in Descriptions

{
  "properties": {
    "date": {
      "type": "string",
      "description": "Invoice date in YYYY-MM-DD format, e.g., 2024-01-15"
    }
  }
}

4. Keep Schemas Focused

Don't try to extract everything at once. Focus on the most important fields for your use case.

Real-World Examples

Medical Record

{
  "type": "object",
  "properties": {
    "patient_name": { "type": "string" },
    "date_of_birth": { "type": "string" },
    "visit_date": { "type": "string" },
    "diagnosis": { "type": "string" },
    "medications": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": { "type": "string" },
          "dosage": { "type": "string" },
          "frequency": { "type": "string" }
        }
      }
    }
  }
}
import * as z from 'zod';

const MedicationSchema = z.object({
  name: z.string(),
  dosage: z.string(),
  frequency: z.string(),
});

const MedicalRecordSchema = z.object({
  patient_name: z.string(),
  date_of_birth: z.string(),
  visit_date: z.string(),
  diagnosis: z.string(),
  medications: z.array(MedicationSchema),
});

type MedicalRecord = z.infer<typeof MedicalRecordSchema>;
from pydantic import BaseModel
from datetime import date

class Medication(BaseModel):
    name: str
    dosage: str
    frequency: str

class MedicalRecord(BaseModel):
    patient_name: str
    date_of_birth: str
    visit_date: str
    diagnosis: str
    medications: list[Medication]
type Medication struct {
    Name      string `json:"name"`
    Dosage    string `json:"dosage"`
    Frequency string `json:"frequency"`
}

type MedicalRecord struct {
    PatientName   string       `json:"patient_name"`
    DateOfBirth   string       `json:"date_of_birth"`
    VisitDate     string       `json:"visit_date"`
    Diagnosis     string       `json:"diagnosis"`
    Medications   []Medication `json:"medications"`
}

Receipt

{
  "type": "object",
  "properties": {
    "merchant_name": { "type": "string" },
    "date": { "type": "string" },
    "total": { "type": "number" },
    "items": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": { "type": "string" },
          "price": { "type": "number" }
        }
      }
    }
  }
}

Contract

{
  "type": "object",
  "properties": {
    "contract_number": { "type": "string" },
    "effective_date": { "type": "string" },
    "expiration_date": { "type": "string" },
    "parties": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": { "type": "string" },
          "role": { "type": "string" }
        }
      }
    },
    "terms": { "type": "string" }
  }
}

Schema vs Instructions vs Templates

You can specify extraction requirements in three ways:

  • Schema: Structured data extraction with defined types
  • Instructions: Natural language description (e.g., "Extract the invoice total and date")
  • Template: Pre-defined document type with standard fields

Note: Only one of these three options can be used per request. Choose based on your needs:

  • Use schema for complex, structured extraction
  • Use instructions for simple, ad-hoc extraction