OCR Processing

Image-to-text with various engines (Tesseract, EasyOCR, cloud APIs), PDF text extraction, table recognition, and layout analysis

OCR Processing Skill

Extract text from images and PDFs using multiple OCR engines, recognize tables, and analyze document layouts.

OCR Processing Skill

Extract text from images and PDFs using multiple OCR engines, recognize tables, and analyze document layouts.

Process

Step 1: Basic Tesseract OCR

python

import pytesseract
from PIL import Image
import io

def extract_text_tesseract(image_path: str, lang: str = "eng") -> str:
    """Extract text from an image file using Tesseract OCR.

    Args:
        image_path: Path to image file
        lang: Tesseract language code (e.g., 'eng', 'spa', 'fra')

    Returns:
        The recognized text with leading and trailing whitespace removed.
    """
    # Image.open keeps the underlying file handle open; the context manager
    # releases it as soon as OCR has finished.
    with Image.open(image_path) as image:
        text = pytesseract.image_to_string(image, lang=lang)
    return text.strip()

# With image preprocessing
def extract_text_preprocessed(image_path: str) -> str:
    """Extract text with image preprocessing for better accuracy.

    The image is converted to grayscale, binarized with Otsu thresholding,
    denoised, and then handed to Tesseract.

    Args:
        image_path: Path to image file

    Returns:
        The recognized text with leading and trailing whitespace removed.

    Raises:
        FileNotFoundError: If OpenCV cannot read the image.
    """
    import cv2

    # cv2.imread signals failure by returning None rather than raising, so
    # check explicitly instead of failing later inside cvtColor.
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Otsu picks the binarization threshold automatically (the 0 is ignored)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Denoise
    denoised = cv2.fastNlMeansDenoising(thresh, None, 10, 7, 21)

    # Tesseract is called through PIL, so wrap the array in a PIL Image
    pil_image = Image.fromarray(denoised)

    text = pytesseract.image_to_string(pil_image)
    return text.strip()

# Usage: OCR a sample image with the default language ("eng")
text = extract_text_tesseract("document.png")

Step 2: EasyOCR for Better Accuracy

python

import easyocr

def extract_text_easyocr(image_path: str, languages: "list[str] | None" = None) -> str:
    """Extract text using EasyOCR (often more accurate than Tesseract).

    Args:
        image_path: Path to image file
        languages: List of language codes (e.g., ['en', 'es']).
            Defaults to ['en'] when omitted.

    Returns:
        All detected text fragments joined with newlines.
    """
    # Build the default per call: a list literal in the signature would be
    # one shared object that any caller (or the library) could mutate.
    if languages is None:
        languages = ["en"]

    reader = easyocr.Reader(languages, gpu=False)
    results = reader.readtext(image_path)

    # Index 1 of each detection holds the recognized text
    return "\n".join(detection[1] for detection in results)

# With confidence filtering
def extract_text_with_confidence(
    image_path: str,
    min_confidence: float = 0.5,
    languages: "list[str] | None" = None,
) -> list:
    """Extract text with confidence scores and bounding boxes.

    Args:
        image_path: Path to image file
        min_confidence: Detections scoring below this value are dropped
        languages: List of language codes (e.g., ['en', 'es']).
            Defaults to ['en'] when omitted.

    Returns:
        A list of dicts with keys "text", "confidence" and "bbox", one per
        detection whose confidence is at least ``min_confidence``.
    """
    if languages is None:
        languages = ["en"]

    reader = easyocr.Reader(languages, gpu=False)
    results = reader.readtext(image_path)

    # Each detection is indexed as (bbox, text, confidence)
    return [
        {
            "text": detection[1],
            "confidence": detection[2],
            "bbox": detection[0],
        }
        for detection in results
        if detection[2] >= min_confidence
    ]

# Usage: plain text extraction, then structured detections that keep only
# results with confidence >= 0.7
text = extract_text_easyocr("document.png")
detections = extract_text_with_confidence("document.png", min_confidence=0.7)

Step 3: PDF Text Extraction

python

from pdf2image import convert_from_path
import pytesseract
from pymupdf import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path: str, method: str = "native", dpi: int = 300) -> dict:
    """Extract text from PDF using different methods.

    Args:
        pdf_path: Path to PDF file
        method: 'native' reads the embedded text layer; any other value
            (conventionally 'ocr') renders each page to an image and OCRs it
        dpi: Rendering resolution used by the OCR path

    Returns:
        ``{"pages": [{"page": <1-based page number>, "text": <str>}, ...]}``
    """
    if method == "native":
        # Text layer extraction is faster, but only works if the PDF has text.
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            pages_text = [
                {"page": page_number, "text": page.get_text()}
                for page_number, page in enumerate(doc, start=1)
            ]
        return {"pages": pages_text}

    # OCR method: convert PDF pages to images, then recognize each one
    images = convert_from_path(pdf_path, dpi=dpi)
    pages_text = [
        {"page": page_number, "text": pytesseract.image_to_string(image)}
        for page_number, image in enumerate(images, start=1)
    ]
    return {"pages": pages_text}

# Hybrid approach: try native first, fallback to OCR
def extract_pdf_hybrid(pdf_path: str, min_native_chars: int = 100) -> dict:
    """Try native extraction first, use OCR if text is sparse.

    Args:
        pdf_path: Path to PDF file
        min_native_chars: A page whose stripped text layer is shorter than
            this is treated as a scanned image and sent through OCR

    Returns:
        ``{"pages": [...]}`` where each entry has "page" (1-based), "text"
        and "method" ("native" or "ocr").
    """
    # Local import so this snippet does not depend on another section's imports
    from PIL import Image

    pages_text = []

    # The context manager closes the document even if OCR raises mid-way
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            native_text = page.get_text()

            if len(native_text.strip()) >= min_native_chars:
                pages_text.append({
                    "page": page_number,
                    "text": native_text,
                    "method": "native"
                })
                continue

            # Sparse text layer: render the page at 2x zoom and OCR the image
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            pages_text.append({
                "page": page_number,
                "text": pytesseract.image_to_string(img),
                "method": "ocr"
            })

    return {"pages": pages_text}

Step 4: Table Recognition and Extraction

python

import cv2
import numpy as np
from PIL import Image
import pytesseract

def detect_tables(image_path: str) -> list:
    """Detect table boundaries in an image.

    Ruling lines are isolated with morphological opening, the horizontal and
    vertical line masks are merged into one grid mask, and the bounding box
    of each connected grid is reported.

    Args:
        image_path: Path to image file

    Returns:
        A list of dicts with keys "x", "y", "width" and "height", one per
        detected region wider than 100 px and taller than 50 px.

    Raises:
        FileNotFoundError: If OpenCV cannot read the image.
    """
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Morphological opening keeps *bright* structures, and findContours treats
    # every non-zero pixel as foreground, so dark ruling lines on a light page
    # must be turned into white-on-black first.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Detect horizontal lines
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
    horizontal_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)

    # Detect vertical lines
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
    vertical_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, vertical_kernel, iterations=2)

    # Merge both masks so a table's intersecting lines form one connected
    # component; a single thin line could never pass the size filter below.
    grid = cv2.bitwise_or(horizontal_lines, vertical_lines)

    # findContours returns 2 values in OpenCV 4.x and 3 values in 3.x
    found = cv2.findContours(grid, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = found[0] if len(found) == 2 else found[1]

    # Find bounding boxes
    tables = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w > 100 and h > 50:  # Filter small detections
            tables.append({"x": x, "y": y, "width": w, "height": h})

    return tables

def extract_table_cells(image_path: str, table_bbox: dict) -> list[list[str]]:
    """Extract text from table cells.

    Args:
        image_path: Path to image file
        table_bbox: Dict with keys "x", "y", "width" and "height" (pixels)
            describing the table region to crop

    Returns:
        One list of cell strings per non-empty text row; an empty list if the
        bounding box selects no pixels.

    Raises:
        FileNotFoundError: If OpenCV cannot read the image.
    """
    import re

    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Crop table region
    x, y, w, h = table_bbox["x"], table_bbox["y"], table_bbox["width"], table_bbox["height"]
    table_img = img[y:y + h, x:x + w]
    if table_img.size == 0:
        return []

    # Convert to PIL (OpenCV stores channels as BGR)
    pil_img = Image.fromarray(cv2.cvtColor(table_img, cv2.COLOR_BGR2RGB))

    # psm 6 treats the crop as one uniform text block. Tesseract collapses
    # runs of spaces unless preserve_interword_spaces is set, which would
    # defeat the column splitting below.
    custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
    text = pytesseract.image_to_string(pil_img, config=custom_config)

    # Parse into rows (simple approach): a run of two or more whitespace
    # characters is taken as a column boundary.
    rows = []
    for line in text.splitlines():
        cells = [cell.strip() for cell in re.split(r"\s{2,}", line.strip()) if cell.strip()]
        if cells:
            rows.append(cells)

    return rows

# Using unstructured library for better table extraction
def extract_tables_unstructured(pdf_path: str) -> list:
    """Collect the tables found in a PDF by the unstructured library.

    Args:
        pdf_path: Path to PDF file

    Returns:
        A list of dicts with keys "text", "html" and "type" (always "table"),
        one per element that carries an HTML rendering in its metadata.
    """
    from unstructured.partition.pdf import partition_pdf

    parsed = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        infer_table_structure=True
    )

    # Only elements with a non-empty text_as_html are reported as tables
    return [
        {
            "text": item.text,
            "html": item.metadata.text_as_html,
            "type": "table"
        }
        for item in parsed
        if hasattr(item, "metadata") and item.metadata.text_as_html
    ]

Step 5: Layout Analysis

python

from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

def analyze_document_layout(pdf_path: str) -> dict:
    """Analyze document structure and layout.

    Args:
        pdf_path: Path to PDF file

    Returns:
        A dict with four lists:
        - "pages": one ``{"page": n, "elements": [...]}`` entry per page that
          has at least one element, in document order
        - "tables", "titles", "paragraphs": the elements of type Table, Title
          and NarrativeText respectively

        Every element is a dict with "page", "type" and "text" (truncated to
        200 characters); table elements may also carry "html".
    """
    elements = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        infer_table_structure=True,
        include_page_breaks=True
    )

    layout = {
        "pages": [],
        "tables": [],
        "titles": [],
        "paragraphs": []
    }
    # Element class name -> layout key it is collected under
    buckets = {
        "Table": "tables",
        "Title": "titles",
        "NarrativeText": "paragraphs",
    }

    current_page = 1
    for element in elements:
        element_type = type(element).__name__

        # Page breaks are markers only: they advance the page counter
        if "PageBreak" in element_type:
            current_page += 1
            continue

        element_data = {
            "page": current_page,
            "type": element_type,
            "text": element.text[:200] if element.text else "",
        }
        if element_type == "Table" and hasattr(element.metadata, "text_as_html"):
            element_data["html"] = element.metadata.text_as_html

        # Group elements per page; current_page only ever increases, so the
        # last entry is the only one that can match.
        if not layout["pages"] or layout["pages"][-1]["page"] != current_page:
            layout["pages"].append({"page": current_page, "elements": []})
        layout["pages"][-1]["elements"].append(element_data)

        bucket = buckets.get(element_type)
        if bucket is not None:
            layout[bucket].append(element_data)

    return layout

def chunk_by_sections(pdf_path: str) -> list:
    """Split a PDF into section-level chunks based on detected titles.

    Args:
        pdf_path: Path to PDF file

    Returns:
        A list of dicts with keys "text" and "metadata", one per chunk.
    """
    parsed = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        infer_table_structure=True
    )

    sections = chunk_by_title(
        parsed,
        max_characters=2000,
        combine_text_under_n_chars=500
    )

    output = []
    for section in sections:
        # Chunks without a metadata attribute get an empty metadata dict
        if hasattr(section, "metadata"):
            meta = section.metadata.to_dict()
        else:
            meta = {}
        output.append({"text": section.text, "metadata": meta})
    return output

Step 6: Cloud OCR APIs Integration

python

from google.cloud import vision
import boto3
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from msrest.authentication import CognitiveServicesCredentials

def extract_text_google_vision(image_path: str, credentials_path: str) -> str:
    """Extract text using Google Cloud Vision API.

    Args:
        image_path: Path to image file
        credentials_path: Path to a service-account JSON key file

    Returns:
        The full detected text, or an empty string if nothing was detected.

    Raises:
        RuntimeError: If the API response reports an error.
    """
    client = vision.ImageAnnotatorClient.from_service_account_file(credentials_path)

    with open(image_path, "rb") as image_file:
        content = image_file.read()

    image = vision.Image(content=content)
    response = client.text_detection(image=image)

    # The call can return normally while carrying an error status; without
    # this check a failed request looks the same as an image with no text.
    if response.error.message:
        raise RuntimeError(f"Google Vision API error: {response.error.message}")

    texts = response.text_annotations
    if texts:
        # The first annotation is returned as the text for the whole image
        return texts[0].description
    return ""

def extract_text_aws_textract(image_path: str, aws_region: str = "us-east-1") -> dict:
    """Run AWS Textract text detection on a local image.

    Args:
        image_path: Path to image file
        aws_region: AWS region of the Textract endpoint

    Returns:
        A dict with "text" (all LINE blocks joined with newlines) and
        "blocks" (the raw block list from the response).
    """
    client = boto3.client("textract", region_name=aws_region)

    with open(image_path, "rb") as document:
        payload = document.read()
    response = client.detect_document_text(Document={"Bytes": payload})

    blocks = response["Blocks"]

    # Only LINE blocks contribute to the plain-text output
    lines = []
    for block in blocks:
        if block["BlockType"] == "LINE":
            lines.append(block["Text"])

    return {"text": "\n".join(lines), "blocks": blocks}

def extract_text_azure_vision(
    image_path: str,
    endpoint: str,
    key: str,
    timeout: float = 120.0,
    poll_interval: float = 1.0,
) -> str:
    """Extract text using Azure Computer Vision.

    Args:
        image_path: Path to image file
        endpoint: Azure Computer Vision endpoint URL
        key: Subscription key for the endpoint
        timeout: Maximum number of seconds to wait for the read operation;
            pass None to wait indefinitely
        poll_interval: Seconds to sleep between status checks

    Returns:
        The recognized lines joined with newlines, or an empty string if the
        operation finished with a status other than "succeeded".

    Raises:
        TimeoutError: If the operation is still pending after ``timeout``.
    """
    import time

    client = ComputerVisionClient(
        endpoint=endpoint,
        credentials=CognitiveServicesCredentials(key)
    )

    with open(image_path, "rb") as image_stream:
        ocr_result = client.read_in_stream(image_stream, raw=True)

    # The operation ID is the last path segment of the Operation-Location header
    operation_id = ocr_result.headers["Operation-Location"].split("/")[-1]

    # Poll until the service reports a final status, but never forever: a
    # stuck operation would otherwise hang the caller.
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        result = client.get_read_result(operation_id)
        if result.status not in ["notStarted", "running"]:
            break
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Azure read operation {operation_id} did not finish within {timeout} seconds"
            )
        time.sleep(poll_interval)

    # Extract text
    text_lines = []
    if result.status == "succeeded":
        for page in result.analyze_result.read_results:
            text_lines.extend(line.text for line in page.lines)

    return "\n".join(text_lines)

python

import pytesseract
from PIL import Image
import io

def extract_text_tesseract(image_path: str, lang: str = "eng") -> str:
    """Extract text from an image file using Tesseract OCR.

    Args:
        image_path: Path to image file
        lang: Tesseract language code (e.g., 'eng', 'spa', 'fra')

    Returns:
        The recognized text with leading and trailing whitespace removed.
    """
    # Image.open keeps the underlying file handle open; the context manager
    # releases it as soon as OCR has finished.
    with Image.open(image_path) as image:
        text = pytesseract.image_to_string(image, lang=lang)
    return text.strip()

# With image preprocessing
def extract_text_preprocessed(image_path: str) -> str:
    """Extract text with image preprocessing for better accuracy.

    The image is converted to grayscale, binarized with Otsu thresholding,
    denoised, and then handed to Tesseract.

    Args:
        image_path: Path to image file

    Returns:
        The recognized text with leading and trailing whitespace removed.

    Raises:
        FileNotFoundError: If OpenCV cannot read the image.
    """
    import cv2

    # cv2.imread signals failure by returning None rather than raising, so
    # check explicitly instead of failing later inside cvtColor.
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Otsu picks the binarization threshold automatically (the 0 is ignored)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Denoise
    denoised = cv2.fastNlMeansDenoising(thresh, None, 10, 7, 21)

    # Tesseract is called through PIL, so wrap the array in a PIL Image
    pil_image = Image.fromarray(denoised)

    text = pytesseract.image_to_string(pil_image)
    return text.strip()

# Usage: OCR a sample image with the default language ("eng")
text = extract_text_tesseract("document.png")

python

import easyocr

def extract_text_easyocr(image_path: str, languages: "list[str] | None" = None) -> str:
    """Extract text using EasyOCR (often more accurate than Tesseract).

    Args:
        image_path: Path to image file
        languages: List of language codes (e.g., ['en', 'es']).
            Defaults to ['en'] when omitted.

    Returns:
        All detected text fragments joined with newlines.
    """
    # Build the default per call: a list literal in the signature would be
    # one shared object that any caller (or the library) could mutate.
    if languages is None:
        languages = ["en"]

    reader = easyocr.Reader(languages, gpu=False)
    results = reader.readtext(image_path)

    # Index 1 of each detection holds the recognized text
    return "\n".join(detection[1] for detection in results)

# With confidence filtering
def extract_text_with_confidence(
    image_path: str,
    min_confidence: float = 0.5,
    languages: "list[str] | None" = None,
) -> list:
    """Extract text with confidence scores and bounding boxes.

    Args:
        image_path: Path to image file
        min_confidence: Detections scoring below this value are dropped
        languages: List of language codes (e.g., ['en', 'es']).
            Defaults to ['en'] when omitted.

    Returns:
        A list of dicts with keys "text", "confidence" and "bbox", one per
        detection whose confidence is at least ``min_confidence``.
    """
    if languages is None:
        languages = ["en"]

    reader = easyocr.Reader(languages, gpu=False)
    results = reader.readtext(image_path)

    # Each detection is indexed as (bbox, text, confidence)
    return [
        {
            "text": detection[1],
            "confidence": detection[2],
            "bbox": detection[0],
        }
        for detection in results
        if detection[2] >= min_confidence
    ]

# Usage: plain text extraction, then structured detections that keep only
# results with confidence >= 0.7
text = extract_text_easyocr("document.png")
detections = extract_text_with_confidence("document.png", min_confidence=0.7)

python

from pdf2image import convert_from_path
import pytesseract
from pymupdf import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path: str, method: str = "native", dpi: int = 300) -> dict:
    """Extract text from PDF using different methods.

    Args:
        pdf_path: Path to PDF file
        method: 'native' reads the embedded text layer; any other value
            (conventionally 'ocr') renders each page to an image and OCRs it
        dpi: Rendering resolution used by the OCR path

    Returns:
        ``{"pages": [{"page": <1-based page number>, "text": <str>}, ...]}``
    """
    if method == "native":
        # Text layer extraction is faster, but only works if the PDF has text.
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            pages_text = [
                {"page": page_number, "text": page.get_text()}
                for page_number, page in enumerate(doc, start=1)
            ]
        return {"pages": pages_text}

    # OCR method: convert PDF pages to images, then recognize each one
    images = convert_from_path(pdf_path, dpi=dpi)
    pages_text = [
        {"page": page_number, "text": pytesseract.image_to_string(image)}
        for page_number, image in enumerate(images, start=1)
    ]
    return {"pages": pages_text}

# Hybrid approach: try native first, fallback to OCR
def extract_pdf_hybrid(pdf_path: str, min_native_chars: int = 100) -> dict:
    """Try native extraction first, use OCR if text is sparse.

    Args:
        pdf_path: Path to PDF file
        min_native_chars: A page whose stripped text layer is shorter than
            this is treated as a scanned image and sent through OCR

    Returns:
        ``{"pages": [...]}`` where each entry has "page" (1-based), "text"
        and "method" ("native" or "ocr").
    """
    # Local import so this snippet does not depend on another section's imports
    from PIL import Image

    pages_text = []

    # The context manager closes the document even if OCR raises mid-way
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            native_text = page.get_text()

            if len(native_text.strip()) >= min_native_chars:
                pages_text.append({
                    "page": page_number,
                    "text": native_text,
                    "method": "native"
                })
                continue

            # Sparse text layer: render the page at 2x zoom and OCR the image
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            pages_text.append({
                "page": page_number,
                "text": pytesseract.image_to_string(img),
                "method": "ocr"
            })

    return {"pages": pages_text}

python

import cv2
import numpy as np
from PIL import Image
import pytesseract

def detect_tables(image_path: str) -> list:
    """Detect table boundaries in an image.

    Ruling lines are isolated with morphological opening, the horizontal and
    vertical line masks are merged into one grid mask, and the bounding box
    of each connected grid is reported.

    Args:
        image_path: Path to image file

    Returns:
        A list of dicts with keys "x", "y", "width" and "height", one per
        detected region wider than 100 px and taller than 50 px.

    Raises:
        FileNotFoundError: If OpenCV cannot read the image.
    """
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Morphological opening keeps *bright* structures, and findContours treats
    # every non-zero pixel as foreground, so dark ruling lines on a light page
    # must be turned into white-on-black first.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Detect horizontal lines
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
    horizontal_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)

    # Detect vertical lines
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
    vertical_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, vertical_kernel, iterations=2)

    # Merge both masks so a table's intersecting lines form one connected
    # component; a single thin line could never pass the size filter below.
    grid = cv2.bitwise_or(horizontal_lines, vertical_lines)

    # findContours returns 2 values in OpenCV 4.x and 3 values in 3.x
    found = cv2.findContours(grid, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = found[0] if len(found) == 2 else found[1]

    # Find bounding boxes
    tables = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w > 100 and h > 50:  # Filter small detections
            tables.append({"x": x, "y": y, "width": w, "height": h})

    return tables

def extract_table_cells(image_path: str, table_bbox: dict) -> list[list[str]]:
    """Extract text from table cells.

    Args:
        image_path: Path to image file
        table_bbox: Dict with keys "x", "y", "width" and "height" (pixels)
            describing the table region to crop

    Returns:
        One list of cell strings per non-empty text row; an empty list if the
        bounding box selects no pixels.

    Raises:
        FileNotFoundError: If OpenCV cannot read the image.
    """
    import re

    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Crop table region
    x, y, w, h = table_bbox["x"], table_bbox["y"], table_bbox["width"], table_bbox["height"]
    table_img = img[y:y + h, x:x + w]
    if table_img.size == 0:
        return []

    # Convert to PIL (OpenCV stores channels as BGR)
    pil_img = Image.fromarray(cv2.cvtColor(table_img, cv2.COLOR_BGR2RGB))

    # psm 6 treats the crop as one uniform text block. Tesseract collapses
    # runs of spaces unless preserve_interword_spaces is set, which would
    # defeat the column splitting below.
    custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
    text = pytesseract.image_to_string(pil_img, config=custom_config)

    # Parse into rows (simple approach): a run of two or more whitespace
    # characters is taken as a column boundary.
    rows = []
    for line in text.splitlines():
        cells = [cell.strip() for cell in re.split(r"\s{2,}", line.strip()) if cell.strip()]
        if cells:
            rows.append(cells)

    return rows

# Using unstructured library for better table extraction
def extract_tables_unstructured(pdf_path: str) -> list:
    """Collect the tables found in a PDF by the unstructured library.

    Args:
        pdf_path: Path to PDF file

    Returns:
        A list of dicts with keys "text", "html" and "type" (always "table"),
        one per element that carries an HTML rendering in its metadata.
    """
    from unstructured.partition.pdf import partition_pdf

    parsed = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        infer_table_structure=True
    )

    # Only elements with a non-empty text_as_html are reported as tables
    return [
        {
            "text": item.text,
            "html": item.metadata.text_as_html,
            "type": "table"
        }
        for item in parsed
        if hasattr(item, "metadata") and item.metadata.text_as_html
    ]

python

from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

def analyze_document_layout(pdf_path: str) -> dict:
    """Analyze document structure and layout.

    Args:
        pdf_path: Path to PDF file

    Returns:
        A dict with four lists:
        - "pages": one ``{"page": n, "elements": [...]}`` entry per page that
          has at least one element, in document order
        - "tables", "titles", "paragraphs": the elements of type Table, Title
          and NarrativeText respectively

        Every element is a dict with "page", "type" and "text" (truncated to
        200 characters); table elements may also carry "html".
    """
    elements = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        infer_table_structure=True,
        include_page_breaks=True
    )

    layout = {
        "pages": [],
        "tables": [],
        "titles": [],
        "paragraphs": []
    }
    # Element class name -> layout key it is collected under
    buckets = {
        "Table": "tables",
        "Title": "titles",
        "NarrativeText": "paragraphs",
    }

    current_page = 1
    for element in elements:
        element_type = type(element).__name__

        # Page breaks are markers only: they advance the page counter
        if "PageBreak" in element_type:
            current_page += 1
            continue

        element_data = {
            "page": current_page,
            "type": element_type,
            "text": element.text[:200] if element.text else "",
        }
        if element_type == "Table" and hasattr(element.metadata, "text_as_html"):
            element_data["html"] = element.metadata.text_as_html

        # Group elements per page; current_page only ever increases, so the
        # last entry is the only one that can match.
        if not layout["pages"] or layout["pages"][-1]["page"] != current_page:
            layout["pages"].append({"page": current_page, "elements": []})
        layout["pages"][-1]["elements"].append(element_data)

        bucket = buckets.get(element_type)
        if bucket is not None:
            layout[bucket].append(element_data)

    return layout

def chunk_by_sections(pdf_path: str) -> list:
    """Split a PDF into section-level chunks based on detected titles.

    Args:
        pdf_path: Path to PDF file

    Returns:
        A list of dicts with keys "text" and "metadata", one per chunk.
    """
    parsed = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        infer_table_structure=True
    )

    sections = chunk_by_title(
        parsed,
        max_characters=2000,
        combine_text_under_n_chars=500
    )

    output = []
    for section in sections:
        # Chunks without a metadata attribute get an empty metadata dict
        if hasattr(section, "metadata"):
            meta = section.metadata.to_dict()
        else:
            meta = {}
        output.append({"text": section.text, "metadata": meta})
    return output

python

from google.cloud import vision
import boto3
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from msrest.authentication import CognitiveServicesCredentials

def extract_text_google_vision(image_path: str, credentials_path: str) -> str:
    """Extract text using Google Cloud Vision API.

    Args:
        image_path: Path to image file
        credentials_path: Path to a service-account JSON key file

    Returns:
        The full detected text, or an empty string if nothing was detected.

    Raises:
        RuntimeError: If the API response reports an error.
    """
    client = vision.ImageAnnotatorClient.from_service_account_file(credentials_path)

    with open(image_path, "rb") as image_file:
        content = image_file.read()

    image = vision.Image(content=content)
    response = client.text_detection(image=image)

    # The call can return normally while carrying an error status; without
    # this check a failed request looks the same as an image with no text.
    if response.error.message:
        raise RuntimeError(f"Google Vision API error: {response.error.message}")

    texts = response.text_annotations
    if texts:
        # The first annotation is returned as the text for the whole image
        return texts[0].description
    return ""

def extract_text_aws_textract(image_path: str, aws_region: str = "us-east-1") -> dict:
    """Run AWS Textract text detection on a local image.

    Args:
        image_path: Path to image file
        aws_region: AWS region of the Textract endpoint

    Returns:
        A dict with "text" (all LINE blocks joined with newlines) and
        "blocks" (the raw block list from the response).
    """
    client = boto3.client("textract", region_name=aws_region)

    with open(image_path, "rb") as document:
        payload = document.read()
    response = client.detect_document_text(Document={"Bytes": payload})

    blocks = response["Blocks"]

    # Only LINE blocks contribute to the plain-text output
    lines = []
    for block in blocks:
        if block["BlockType"] == "LINE":
            lines.append(block["Text"])

    return {"text": "\n".join(lines), "blocks": blocks}

def extract_text_azure_vision(
    image_path: str,
    endpoint: str,
    key: str,
    timeout: float = 120.0,
    poll_interval: float = 1.0,
) -> str:
    """Extract text using Azure Computer Vision.

    Args:
        image_path: Path to image file
        endpoint: Azure Computer Vision endpoint URL
        key: Subscription key for the endpoint
        timeout: Maximum number of seconds to wait for the read operation;
            pass None to wait indefinitely
        poll_interval: Seconds to sleep between status checks

    Returns:
        The recognized lines joined with newlines, or an empty string if the
        operation finished with a status other than "succeeded".

    Raises:
        TimeoutError: If the operation is still pending after ``timeout``.
    """
    import time

    client = ComputerVisionClient(
        endpoint=endpoint,
        credentials=CognitiveServicesCredentials(key)
    )

    with open(image_path, "rb") as image_stream:
        ocr_result = client.read_in_stream(image_stream, raw=True)

    # The operation ID is the last path segment of the Operation-Location header
    operation_id = ocr_result.headers["Operation-Location"].split("/")[-1]

    # Poll until the service reports a final status, but never forever: a
    # stuck operation would otherwise hang the caller.
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        result = client.get_read_result(operation_id)
        if result.status not in ["notStarted", "running"]:
            break
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Azure read operation {operation_id} did not finish within {timeout} seconds"
            )
        time.sleep(poll_interval)

    # Extract text
    text_lines = []
    if result.status == "succeeded":
        for page in result.analyze_result.read_results:
            text_lines.extend(line.text for line in page.lines)

    return "\n".join(text_lines)

OCR Engines Comparison

Engine	Accuracy	Speed	Languages	Cost
Tesseract	Medium	Fast	100+	Free
EasyOCR	High	Medium	80+	Free
Google Vision	Very High	Fast	50+	Paid
AWS Textract	Very High	Fast	50+	Paid
Azure Vision	Very High	Fast	50+	Paid

Best Practices

•Preprocess images (grayscale, thresholding, denoising) before OCR
•Use native PDF text extraction when available, OCR as fallback
•Set appropriate DPI (300+) when converting PDFs to images
•Filter low-confidence detections to reduce noise
•Use layout analysis to preserve document structure
•Cache OCR results for repeated processing
•Choose engine based on accuracy needs vs. cost constraints
•Handle multi-language documents with language detection

Anti-Patterns

Anti-Pattern	Fix
OCR without preprocessing	Apply grayscale, thresholding, denoising
Low DPI PDF conversion	Use 300+ DPI for better accuracy
Ignoring confidence scores	Filter results below threshold
No fallback for native PDF	Try native first, OCR if sparse
Processing entire page	Crop to regions of interest
No layout preservation	Use structured extraction tools
Single language assumption	Detect and specify language codes

Related

•Skill: vision-agents
•Skill: rag-patterns
•Skill: advanced-retrieval

Prerequisites

[!IMPORTANT] Requirements:

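•Packages: pytesseract, pillow, pdf2image, easyocr, unstructured[pdf], pymupdf, opencv-python
•Packages for the cloud APIs in Step 6: google-cloud-vision, boto3, azure-cognitiveservices-vision-computervision, msrest
•System binaries: the Tesseract OCR engine (required by pytesseract) and Poppler (required by pdf2image)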

OCR Processing

OCR Processing Skill

OCR Processing Skill

Process

Step 1: Basic Tesseract OCR

Step 2: EasyOCR for Better Accuracy

Step 3: PDF Text Extraction

Step 4: Table Recognition and Extraction

Step 5: Layout Analysis

Step 6: Cloud OCR APIs Integration

OCR Engines Comparison

Best Practices

Anti-Patterns

Related

Prerequisites