AgentSkillsCN

Ocr Processing

利用多种引擎(Tesseract、EasyOCR、云端 API)实现图像转文字、PDF 文本提取、表格识别与版面分析

SKILL.md
--- frontmatter
description: Image-to-text with various engines (Tesseract, EasyOCR, cloud APIs), PDF text extraction, table recognition, and layout analysis

Ocr Processing

Image-to-text with various engines (Tesseract, EasyOCR, cloud APIs), PDF text extraction, table recognition, and layout analysis

OCR Processing Skill

Extract text from images and PDFs using multiple OCR engines, recognize tables, and analyze document layouts.

OCR Processing Skill

Extract text from images and PDFs using multiple OCR engines, recognize tables, and analyze document layouts.

Process

Step 1: Basic Tesseract OCR

python
import pytesseract
from PIL import Image
import io

def extract_text_tesseract(image_path: str, lang: str = "eng") -> str:
    """Extract text from an image file using Tesseract OCR.

    Args:
        image_path: Path to image file
        lang: Language code (e.g., 'eng', 'spa', 'fra')

    Returns:
        The recognized text with leading and trailing whitespace removed.
    """
    # Open the image in a context manager so the underlying file handle is
    # released even when OCR raises.
    with Image.open(image_path) as image:
        text = pytesseract.image_to_string(image, lang=lang)
    return text.strip()

# With image preprocessing
def extract_text_preprocessed(image_path: str) -> str:
    """Extract text after grayscale conversion, Otsu thresholding and denoising.

    Args:
        image_path: Path to image file

    Returns:
        The recognized text with leading and trailing whitespace removed.

    Raises:
        ValueError: If OpenCV cannot read the image at ``image_path``.
    """
    import cv2

    img = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising, which
    # would otherwise surface as an opaque error inside cvtColor.
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Binarize; with THRESH_OTSU the threshold is picked automatically, so the
    # 0 passed as the threshold value is only a placeholder.
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Denoise (filter strength 10, template window 7, search window 21)
    denoised = cv2.fastNlMeansDenoising(thresh, None, 10, 7, 21)

    # Tesseract is fed a PIL image built from the processed array
    pil_image = Image.fromarray(denoised)

    text = pytesseract.image_to_string(pil_image)
    return text.strip()

# Usage: OCR a sample image with the default language ("eng")
text = extract_text_tesseract("document.png")

Step 2: EasyOCR for Better Accuracy

python
import easyocr

def extract_text_easyocr(image_path: str, languages: "list[str] | None" = None) -> str:
    """Extract text using EasyOCR (often more accurate than Tesseract).

    Args:
        image_path: Path to image file
        languages: List of language codes (e.g., ['en', 'es']). Defaults to
            ['en'] when omitted or None.

    Returns:
        The detected text fragments joined with newlines.
    """
    # A list literal as the default would be shared across calls, so the
    # default is materialized per call instead.
    if languages is None:
        languages = ["en"]

    reader = easyocr.Reader(languages, gpu=False)
    results = reader.readtext(image_path)

    # Index 1 of every detection holds the recognized text
    return "\n".join(detection[1] for detection in results)

# With confidence filtering
def extract_text_with_confidence(
    image_path: str,
    min_confidence: float = 0.5,
    languages: "list[str] | None" = None,
) -> list:
    """Extract text with confidence scores and bounding boxes.

    Args:
        image_path: Path to image file
        min_confidence: Detections scoring below this value are dropped.
        languages: List of language codes (e.g., ['en', 'es']). Defaults to
            ['en'] when omitted or None.

    Returns:
        A list of dicts with the keys "text", "confidence" and "bbox", one per
        detection whose confidence is at least ``min_confidence``.
    """
    if languages is None:
        languages = ["en"]

    reader = easyocr.Reader(languages, gpu=False)
    results = reader.readtext(image_path)

    # Each detection is indexed as [0]=bounding box, [1]=text, [2]=confidence
    return [
        {
            "text": detection[1],
            "confidence": detection[2],
            "bbox": detection[0],
        }
        for detection in results
        if detection[2] >= min_confidence
    ]

# Usage: plain text extraction, then detections kept only at confidence >= 0.7
text = extract_text_easyocr("document.png")
detections = extract_text_with_confidence("document.png", min_confidence=0.7)

Step 3: PDF Text Extraction

python
from pdf2image import convert_from_path
import pytesseract
from pymupdf import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path: str, method: str = "native") -> dict:
    """Extract text from PDF using different methods.

    Args:
        pdf_path: Path to PDF file
        method: 'native' (text layer) or 'ocr' (image-based). Any value other
            than 'native' selects the OCR path.

    Returns:
        A dict ``{"pages": [...]}`` where each entry holds the 1-based "page"
        number and that page's "text".
    """
    if method == "native":
        # Extract from text layer (faster, but only works if PDF has text).
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            pages_text = [
                {"page": page_number, "text": page.get_text()}
                for page_number, page in enumerate(doc, start=1)
            ]
        return {"pages": pages_text}

    # OCR method: render every page to an image at 300 DPI, then run Tesseract
    images = convert_from_path(pdf_path, dpi=300)
    pages_text = [
        {"page": page_number, "text": pytesseract.image_to_string(image)}
        for page_number, image in enumerate(images, start=1)
    ]
    return {"pages": pages_text}

# Hybrid approach: try native first, fallback to OCR
def extract_pdf_hybrid(pdf_path: str, min_native_chars: int = 100) -> dict:
    """Try native extraction first, use OCR if text is sparse.

    Args:
        pdf_path: Path to PDF file
        min_native_chars: Pages whose stripped text layer is shorter than this
            are treated as scanned images and sent through OCR.

    Returns:
        A dict ``{"pages": [...]}`` where each entry holds the 1-based "page"
        number, the page "text", and the "method" used ("native" or "ocr").
    """
    # Imported locally because this snippet's own import block does not
    # bring in PIL.
    from PIL import Image

    pages_text = []

    # The context manager closes the document even if rendering or OCR fails.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            native_text = page.get_text()

            if len(native_text.strip()) >= min_native_chars:
                pages_text.append({
                    "page": page_number,
                    "text": native_text,
                    "method": "native"
                })
                continue

            # Sparse text layer: render the page at 2x zoom and OCR the image
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            pages_text.append({
                "page": page_number,
                "text": pytesseract.image_to_string(img),
                "method": "ocr"
            })

    return {"pages": pages_text}

Step 4: Table Recognition and Extraction

python
import cv2
import numpy as np
from PIL import Image
import pytesseract

def detect_tables(image_path: str) -> list:
    """Detect table boundaries in an image.

    Ruling lines are isolated with morphological opening on an inverted
    binary image; horizontal and vertical lines are merged into one mask so
    that each connected grid yields a single bounding box.

    Args:
        image_path: Path to image file

    Returns:
        A list of dicts with the keys "x", "y", "width" and "height", one per
        detected grid wider than 100 px and taller than 50 px.

    Raises:
        ValueError: If OpenCV cannot read the image at ``image_path``.
    """
    img = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising.
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Morphology and findContours treat non-zero pixels as foreground, so the
    # dark ruling lines must become white on black before they can be found.
    _, binary = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )

    # Detect horizontal lines
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
    horizontal_lines = cv2.morphologyEx(
        binary, cv2.MORPH_OPEN, horizontal_kernel, iterations=2
    )

    # Detect vertical lines
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
    vertical_lines = cv2.morphologyEx(
        binary, cv2.MORPH_OPEN, vertical_kernel, iterations=2
    )

    # Merge both masks: an individual line is only a few pixels thick and
    # could never pass the size filter below, but the connected grid can.
    grid_mask = cv2.bitwise_or(horizontal_lines, vertical_lines)

    found = cv2.findContours(grid_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns 2 values on some OpenCV versions and 3 on others
    contours = found[0] if len(found) == 2 else found[1]

    # Find bounding boxes
    tables = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w > 100 and h > 50:  # Filter small detections
            tables.append({"x": x, "y": y, "width": w, "height": h})

    return tables

def extract_table_cells(image_path: str, table_bbox: dict) -> list[list[str]]:
    """Extract text from table cells.

    Args:
        image_path: Path to image file
        table_bbox: Dict with the keys "x", "y", "width" and "height"
            describing the table region in pixels.

    Returns:
        One list of cell strings per non-empty text line in the region.

    Raises:
        ValueError: If OpenCV cannot read the image at ``image_path``.
    """
    import re

    img = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising.
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")

    # Crop table region
    x, y, w, h = table_bbox["x"], table_bbox["y"], table_bbox["width"], table_bbox["height"]
    table_img = img[y:y+h, x:x+w]

    # Convert to PIL
    pil_img = Image.fromarray(cv2.cvtColor(table_img, cv2.COLOR_BGR2RGB))

    # --psm 6 treats the crop as one uniform block of text.
    # preserve_interword_spaces=1 keeps the wide gaps between columns, which
    # the row parsing below relies on to separate cells.
    custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
    text = pytesseract.image_to_string(pil_img, config=custom_config)

    # Parse into rows (simple approach)
    rows = []
    for line in text.strip().split('\n'):
        if not line.strip():
            continue
        # Split on runs of two or more spaces (assuming tabular format)
        cells = [cell.strip() for cell in re.split(r' {2,}', line) if cell.strip()]
        if cells:
            rows.append(cells)

    return rows

# Using unstructured library for better table extraction
def extract_tables_unstructured(pdf_path: str) -> list:
    """Collect every element of a PDF that carries an HTML table rendering.

    Args:
        pdf_path: Path to PDF file

    Returns:
        A list of dicts with the keys "text", "html" and "type" (always
        "table"), one per element whose metadata has a truthy text_as_html.
    """
    from unstructured.partition.pdf import partition_pdf

    parsed = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        infer_table_structure=True
    )

    def _has_table_html(item) -> bool:
        # Only elements exposing metadata with a truthy HTML rendering count
        return bool(hasattr(item, "metadata") and item.metadata.text_as_html)

    return [
        {
            "text": item.text,
            "html": item.metadata.text_as_html,
            "type": "table"
        }
        for item in parsed
        if _has_table_html(item)
    ]

Step 5: Layout Analysis

python
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

def analyze_document_layout(pdf_path: str) -> dict:
    """Analyze document structure and layout.

    Args:
        pdf_path: Path to PDF file

    Returns:
        A dict with four lists:
          - "pages": one ``{"page": n, "elements": [...]}`` entry per page
            that has at least one element, in document order
          - "tables", "titles", "paragraphs": the element summaries of type
            Table, Title and NarrativeText respectively
        Every element summary holds "page", "type" and "text" (first 200
        characters); table summaries may also carry "html".
    """
    elements = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        infer_table_structure=True,
        include_page_breaks=True
    )

    layout = {
        "pages": [],
        "tables": [],
        "titles": [],
        "paragraphs": []
    }

    # Element class name -> layout list that collects it
    buckets = {
        "Table": "tables",
        "Title": "titles",
        "NarrativeText": "paragraphs",
    }

    current_page = 1
    for element in elements:
        element_type = type(element).__name__

        # Page breaks only advance the counter; they are not recorded
        if "PageBreak" in element_type:
            current_page += 1
            continue

        element_data = {
            "page": current_page,
            "type": element_type,
            "text": element.text[:200] if element.text else "",
        }

        if element_type == "Table" and hasattr(element.metadata, "text_as_html"):
            element_data["html"] = element.metadata.text_as_html

        bucket = buckets.get(element_type)
        if bucket is not None:
            layout[bucket].append(element_data)

        # Group every element under its page so "pages" is actually populated
        pages = layout["pages"]
        if not pages or pages[-1]["page"] != current_page:
            pages.append({"page": current_page, "elements": []})
        pages[-1]["elements"].append(element_data)

    return layout

def chunk_by_sections(pdf_path: str) -> list:
    """Split a PDF into title-delimited chunks.

    Args:
        pdf_path: Path to PDF file

    Returns:
        A list of dicts with the chunk "text" and its "metadata" as a dict
        (empty when the chunk exposes no metadata attribute).
    """
    parsed = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        infer_table_structure=True
    )

    sections = chunk_by_title(
        parsed,
        max_characters=2000,
        combine_text_under_n_chars=500
    )

    output = []
    for section in sections:
        entry = {"text": section.text}
        if hasattr(section, "metadata"):
            entry["metadata"] = section.metadata.to_dict()
        else:
            entry["metadata"] = {}
        output.append(entry)
    return output

Step 6: Cloud OCR APIs Integration

python
from google.cloud import vision
import boto3
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from msrest.authentication import CognitiveServicesCredentials

def extract_text_google_vision(image_path: str, credentials_path: str) -> str:
    """Extract text using Google Cloud Vision API.

    Args:
        image_path: Path to image file
        credentials_path: Path to the service-account JSON key file.

    Returns:
        The description of the first text annotation, or "" when the API
        returned no annotations.

    Raises:
        RuntimeError: If the API response carries an error message.
    """
    client = vision.ImageAnnotatorClient.from_service_account_file(credentials_path)

    with open(image_path, "rb") as image_file:
        content = image_file.read()

    image = vision.Image(content=content)
    response = client.text_detection(image=image)

    # A failed request is reported inside the response rather than raised;
    # without this check it would be indistinguishable from "no text found".
    if response.error.message:
        raise RuntimeError(f"Google Vision API error: {response.error.message}")

    texts = response.text_annotations
    if texts:
        return texts[0].description
    return ""

def extract_text_aws_textract(image_path: str, aws_region: str = "us-east-1") -> dict:
    """Run AWS Textract text detection on a local image.

    Args:
        image_path: Path to image file
        aws_region: AWS region used to create the Textract client.

    Returns:
        A dict with "text" (the Text of every LINE block, newline-joined) and
        "blocks" (the raw Blocks list from the response).
    """
    client = boto3.client("textract", region_name=aws_region)

    with open(image_path, "rb") as handle:
        response = client.detect_document_text(
            Document={"Bytes": handle.read()}
        )

    raw_blocks = response["Blocks"]

    # Keep only line-level blocks
    lines = []
    for item in raw_blocks:
        if item["BlockType"] == "LINE":
            lines.append(item["Text"])

    return {
        "text": "\n".join(lines),
        "blocks": raw_blocks
    }

def extract_text_azure_vision(
    image_path: str,
    endpoint: str,
    key: str,
    timeout: float = 60.0,
    poll_interval: float = 1.0,
) -> str:
    """Extract text using Azure Computer Vision.

    Args:
        image_path: Path to image file
        endpoint: Endpoint URL of the Computer Vision resource.
        key: Subscription key for the resource.
        timeout: Maximum number of seconds to wait for the read operation.
        poll_interval: Seconds to sleep between status polls.

    Returns:
        The recognized lines joined with newlines, or "" when the operation
        finished with a status other than "succeeded".

    Raises:
        TimeoutError: If the operation is still pending after ``timeout``
            seconds.
    """
    import time

    client = ComputerVisionClient(
        endpoint=endpoint,
        credentials=CognitiveServicesCredentials(key)
    )

    with open(image_path, "rb") as image_stream:
        ocr_result = client.read_in_stream(image_stream, raw=True)

    # The operation ID is the last path segment of the Operation-Location header
    operation_id = ocr_result.headers["Operation-Location"].split("/")[-1]

    # Poll until the operation leaves its pending states, bounded by a
    # deadline so a stuck operation cannot block the caller forever.
    deadline = time.monotonic() + timeout
    while True:
        result = client.get_read_result(operation_id)
        if result.status not in ["notStarted", "running"]:
            break
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Azure read operation {operation_id} did not finish within {timeout} seconds"
            )
        time.sleep(poll_interval)

    # Extract text
    text_lines = []
    if result.status == "succeeded":
        for page in result.analyze_result.read_results:
            text_lines.extend(line.text for line in page.lines)

    return "\n".join(text_lines)
python
import pytesseract
from PIL import Image
import io

def extract_text_tesseract(image_path: str, lang: str = "eng") -> str:
    """Extract text from an image file using Tesseract OCR.

    Args:
        image_path: Path to image file
        lang: Language code (e.g., 'eng', 'spa', 'fra')

    Returns:
        The recognized text with leading and trailing whitespace removed.
    """
    # Open the image in a context manager so the underlying file handle is
    # released even when OCR raises.
    with Image.open(image_path) as image:
        text = pytesseract.image_to_string(image, lang=lang)
    return text.strip()

# With image preprocessing
def extract_text_preprocessed(image_path: str) -> str:
    """Extract text after grayscale conversion, Otsu thresholding and denoising.

    Args:
        image_path: Path to image file

    Returns:
        The recognized text with leading and trailing whitespace removed.

    Raises:
        ValueError: If OpenCV cannot read the image at ``image_path``.
    """
    import cv2

    img = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising, which
    # would otherwise surface as an opaque error inside cvtColor.
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Binarize; with THRESH_OTSU the threshold is picked automatically, so the
    # 0 passed as the threshold value is only a placeholder.
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Denoise (filter strength 10, template window 7, search window 21)
    denoised = cv2.fastNlMeansDenoising(thresh, None, 10, 7, 21)

    # Tesseract is fed a PIL image built from the processed array
    pil_image = Image.fromarray(denoised)

    text = pytesseract.image_to_string(pil_image)
    return text.strip()

# Usage: OCR a sample image with the default language ("eng")
text = extract_text_tesseract("document.png")
python
import easyocr

def extract_text_easyocr(image_path: str, languages: "list[str] | None" = None) -> str:
    """Extract text using EasyOCR (often more accurate than Tesseract).

    Args:
        image_path: Path to image file
        languages: List of language codes (e.g., ['en', 'es']). Defaults to
            ['en'] when omitted or None.

    Returns:
        The detected text fragments joined with newlines.
    """
    # A list literal as the default would be shared across calls, so the
    # default is materialized per call instead.
    if languages is None:
        languages = ["en"]

    reader = easyocr.Reader(languages, gpu=False)
    results = reader.readtext(image_path)

    # Index 1 of every detection holds the recognized text
    return "\n".join(detection[1] for detection in results)

# With confidence filtering
def extract_text_with_confidence(
    image_path: str,
    min_confidence: float = 0.5,
    languages: "list[str] | None" = None,
) -> list:
    """Extract text with confidence scores and bounding boxes.

    Args:
        image_path: Path to image file
        min_confidence: Detections scoring below this value are dropped.
        languages: List of language codes (e.g., ['en', 'es']). Defaults to
            ['en'] when omitted or None.

    Returns:
        A list of dicts with the keys "text", "confidence" and "bbox", one per
        detection whose confidence is at least ``min_confidence``.
    """
    if languages is None:
        languages = ["en"]

    reader = easyocr.Reader(languages, gpu=False)
    results = reader.readtext(image_path)

    # Each detection is indexed as [0]=bounding box, [1]=text, [2]=confidence
    return [
        {
            "text": detection[1],
            "confidence": detection[2],
            "bbox": detection[0],
        }
        for detection in results
        if detection[2] >= min_confidence
    ]

# Usage: plain text extraction, then detections kept only at confidence >= 0.7
text = extract_text_easyocr("document.png")
detections = extract_text_with_confidence("document.png", min_confidence=0.7)
python
from pdf2image import convert_from_path
import pytesseract
from pymupdf import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path: str, method: str = "native") -> dict:
    """Extract text from PDF using different methods.

    Args:
        pdf_path: Path to PDF file
        method: 'native' (text layer) or 'ocr' (image-based). Any value other
            than 'native' selects the OCR path.

    Returns:
        A dict ``{"pages": [...]}`` where each entry holds the 1-based "page"
        number and that page's "text".
    """
    if method == "native":
        # Extract from text layer (faster, but only works if PDF has text).
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            pages_text = [
                {"page": page_number, "text": page.get_text()}
                for page_number, page in enumerate(doc, start=1)
            ]
        return {"pages": pages_text}

    # OCR method: render every page to an image at 300 DPI, then run Tesseract
    images = convert_from_path(pdf_path, dpi=300)
    pages_text = [
        {"page": page_number, "text": pytesseract.image_to_string(image)}
        for page_number, image in enumerate(images, start=1)
    ]
    return {"pages": pages_text}

# Hybrid approach: try native first, fallback to OCR
def extract_pdf_hybrid(pdf_path: str, min_native_chars: int = 100) -> dict:
    """Try native extraction first, use OCR if text is sparse.

    Args:
        pdf_path: Path to PDF file
        min_native_chars: Pages whose stripped text layer is shorter than this
            are treated as scanned images and sent through OCR.

    Returns:
        A dict ``{"pages": [...]}`` where each entry holds the 1-based "page"
        number, the page "text", and the "method" used ("native" or "ocr").
    """
    # Imported locally because this snippet's own import block does not
    # bring in PIL.
    from PIL import Image

    pages_text = []

    # The context manager closes the document even if rendering or OCR fails.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            native_text = page.get_text()

            if len(native_text.strip()) >= min_native_chars:
                pages_text.append({
                    "page": page_number,
                    "text": native_text,
                    "method": "native"
                })
                continue

            # Sparse text layer: render the page at 2x zoom and OCR the image
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            pages_text.append({
                "page": page_number,
                "text": pytesseract.image_to_string(img),
                "method": "ocr"
            })

    return {"pages": pages_text}
python
import cv2
import numpy as np
from PIL import Image
import pytesseract

def detect_tables(image_path: str) -> list:
    """Detect table boundaries in an image.

    Ruling lines are isolated with morphological opening on an inverted
    binary image; horizontal and vertical lines are merged into one mask so
    that each connected grid yields a single bounding box.

    Args:
        image_path: Path to image file

    Returns:
        A list of dicts with the keys "x", "y", "width" and "height", one per
        detected grid wider than 100 px and taller than 50 px.

    Raises:
        ValueError: If OpenCV cannot read the image at ``image_path``.
    """
    img = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising.
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Morphology and findContours treat non-zero pixels as foreground, so the
    # dark ruling lines must become white on black before they can be found.
    _, binary = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )

    # Detect horizontal lines
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
    horizontal_lines = cv2.morphologyEx(
        binary, cv2.MORPH_OPEN, horizontal_kernel, iterations=2
    )

    # Detect vertical lines
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
    vertical_lines = cv2.morphologyEx(
        binary, cv2.MORPH_OPEN, vertical_kernel, iterations=2
    )

    # Merge both masks: an individual line is only a few pixels thick and
    # could never pass the size filter below, but the connected grid can.
    grid_mask = cv2.bitwise_or(horizontal_lines, vertical_lines)

    found = cv2.findContours(grid_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns 2 values on some OpenCV versions and 3 on others
    contours = found[0] if len(found) == 2 else found[1]

    # Find bounding boxes
    tables = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w > 100 and h > 50:  # Filter small detections
            tables.append({"x": x, "y": y, "width": w, "height": h})

    return tables

def extract_table_cells(image_path: str, table_bbox: dict) -> list[list[str]]:
    """Extract text from table cells.

    Args:
        image_path: Path to image file
        table_bbox: Dict with the keys "x", "y", "width" and "height"
            describing the table region in pixels.

    Returns:
        One list of cell strings per non-empty text line in the region.

    Raises:
        ValueError: If OpenCV cannot read the image at ``image_path``.
    """
    import re

    img = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising.
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")

    # Crop table region
    x, y, w, h = table_bbox["x"], table_bbox["y"], table_bbox["width"], table_bbox["height"]
    table_img = img[y:y+h, x:x+w]

    # Convert to PIL
    pil_img = Image.fromarray(cv2.cvtColor(table_img, cv2.COLOR_BGR2RGB))

    # --psm 6 treats the crop as one uniform block of text.
    # preserve_interword_spaces=1 keeps the wide gaps between columns, which
    # the row parsing below relies on to separate cells.
    custom_config = r'--oem 3 --psm 6 -c preserve_interword_spaces=1'
    text = pytesseract.image_to_string(pil_img, config=custom_config)

    # Parse into rows (simple approach)
    rows = []
    for line in text.strip().split('\n'):
        if not line.strip():
            continue
        # Split on runs of two or more spaces (assuming tabular format)
        cells = [cell.strip() for cell in re.split(r' {2,}', line) if cell.strip()]
        if cells:
            rows.append(cells)

    return rows

# Using unstructured library for better table extraction
def extract_tables_unstructured(pdf_path: str) -> list:
    """Collect every element of a PDF that carries an HTML table rendering.

    Args:
        pdf_path: Path to PDF file

    Returns:
        A list of dicts with the keys "text", "html" and "type" (always
        "table"), one per element whose metadata has a truthy text_as_html.
    """
    from unstructured.partition.pdf import partition_pdf

    parsed = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        infer_table_structure=True
    )

    def _has_table_html(item) -> bool:
        # Only elements exposing metadata with a truthy HTML rendering count
        return bool(hasattr(item, "metadata") and item.metadata.text_as_html)

    return [
        {
            "text": item.text,
            "html": item.metadata.text_as_html,
            "type": "table"
        }
        for item in parsed
        if _has_table_html(item)
    ]
python
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

def analyze_document_layout(pdf_path: str) -> dict:
    """Analyze document structure and layout.

    Args:
        pdf_path: Path to PDF file

    Returns:
        A dict with four lists:
          - "pages": one ``{"page": n, "elements": [...]}`` entry per page
            that has at least one element, in document order
          - "tables", "titles", "paragraphs": the element summaries of type
            Table, Title and NarrativeText respectively
        Every element summary holds "page", "type" and "text" (first 200
        characters); table summaries may also carry "html".
    """
    elements = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        infer_table_structure=True,
        include_page_breaks=True
    )

    layout = {
        "pages": [],
        "tables": [],
        "titles": [],
        "paragraphs": []
    }

    # Element class name -> layout list that collects it
    buckets = {
        "Table": "tables",
        "Title": "titles",
        "NarrativeText": "paragraphs",
    }

    current_page = 1
    for element in elements:
        element_type = type(element).__name__

        # Page breaks only advance the counter; they are not recorded
        if "PageBreak" in element_type:
            current_page += 1
            continue

        element_data = {
            "page": current_page,
            "type": element_type,
            "text": element.text[:200] if element.text else "",
        }

        if element_type == "Table" and hasattr(element.metadata, "text_as_html"):
            element_data["html"] = element.metadata.text_as_html

        bucket = buckets.get(element_type)
        if bucket is not None:
            layout[bucket].append(element_data)

        # Group every element under its page so "pages" is actually populated
        pages = layout["pages"]
        if not pages or pages[-1]["page"] != current_page:
            pages.append({"page": current_page, "elements": []})
        pages[-1]["elements"].append(element_data)

    return layout

def chunk_by_sections(pdf_path: str) -> list:
    """Split a PDF into title-delimited chunks.

    Args:
        pdf_path: Path to PDF file

    Returns:
        A list of dicts with the chunk "text" and its "metadata" as a dict
        (empty when the chunk exposes no metadata attribute).
    """
    parsed = partition_pdf(
        filename=pdf_path,
        strategy="hi_res",
        infer_table_structure=True
    )

    sections = chunk_by_title(
        parsed,
        max_characters=2000,
        combine_text_under_n_chars=500
    )

    output = []
    for section in sections:
        entry = {"text": section.text}
        if hasattr(section, "metadata"):
            entry["metadata"] = section.metadata.to_dict()
        else:
            entry["metadata"] = {}
        output.append(entry)
    return output
python
from google.cloud import vision
import boto3
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from msrest.authentication import CognitiveServicesCredentials

def extract_text_google_vision(image_path: str, credentials_path: str) -> str:
    """Extract text using Google Cloud Vision API.

    Args:
        image_path: Path to image file
        credentials_path: Path to the service-account JSON key file.

    Returns:
        The description of the first text annotation, or "" when the API
        returned no annotations.

    Raises:
        RuntimeError: If the API response carries an error message.
    """
    client = vision.ImageAnnotatorClient.from_service_account_file(credentials_path)

    with open(image_path, "rb") as image_file:
        content = image_file.read()

    image = vision.Image(content=content)
    response = client.text_detection(image=image)

    # A failed request is reported inside the response rather than raised;
    # without this check it would be indistinguishable from "no text found".
    if response.error.message:
        raise RuntimeError(f"Google Vision API error: {response.error.message}")

    texts = response.text_annotations
    if texts:
        return texts[0].description
    return ""

def extract_text_aws_textract(image_path: str, aws_region: str = "us-east-1") -> dict:
    """Run AWS Textract text detection on a local image.

    Args:
        image_path: Path to image file
        aws_region: AWS region used to create the Textract client.

    Returns:
        A dict with "text" (the Text of every LINE block, newline-joined) and
        "blocks" (the raw Blocks list from the response).
    """
    client = boto3.client("textract", region_name=aws_region)

    with open(image_path, "rb") as handle:
        response = client.detect_document_text(
            Document={"Bytes": handle.read()}
        )

    raw_blocks = response["Blocks"]

    # Keep only line-level blocks
    lines = []
    for item in raw_blocks:
        if item["BlockType"] == "LINE":
            lines.append(item["Text"])

    return {
        "text": "\n".join(lines),
        "blocks": raw_blocks
    }

def extract_text_azure_vision(
    image_path: str,
    endpoint: str,
    key: str,
    timeout: float = 60.0,
    poll_interval: float = 1.0,
) -> str:
    """Extract text using Azure Computer Vision.

    Args:
        image_path: Path to image file
        endpoint: Endpoint URL of the Computer Vision resource.
        key: Subscription key for the resource.
        timeout: Maximum number of seconds to wait for the read operation.
        poll_interval: Seconds to sleep between status polls.

    Returns:
        The recognized lines joined with newlines, or "" when the operation
        finished with a status other than "succeeded".

    Raises:
        TimeoutError: If the operation is still pending after ``timeout``
            seconds.
    """
    import time

    client = ComputerVisionClient(
        endpoint=endpoint,
        credentials=CognitiveServicesCredentials(key)
    )

    with open(image_path, "rb") as image_stream:
        ocr_result = client.read_in_stream(image_stream, raw=True)

    # The operation ID is the last path segment of the Operation-Location header
    operation_id = ocr_result.headers["Operation-Location"].split("/")[-1]

    # Poll until the operation leaves its pending states, bounded by a
    # deadline so a stuck operation cannot block the caller forever.
    deadline = time.monotonic() + timeout
    while True:
        result = client.get_read_result(operation_id)
        if result.status not in ["notStarted", "running"]:
            break
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Azure read operation {operation_id} did not finish within {timeout} seconds"
            )
        time.sleep(poll_interval)

    # Extract text
    text_lines = []
    if result.status == "succeeded":
        for page in result.analyze_result.read_results:
            text_lines.extend(line.text for line in page.lines)

    return "\n".join(text_lines)

OCR Engines Comparison

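| Engine | Accuracy | Speed | Languages | Cost |
|---|---|---|---|---|
| Tesseract | Medium | Fast | 100+ | Free |
| EasyOCR | High | Medium | 80+ | Free |
| Google Vision | Very High | Fast | 50+ | Paid |
| AWS Textract | Very High | Fast | 50+ | Paid |
| Azure Vision | Very High | Fast | 50+ | Paid |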

Best Practices

  • Preprocess images (grayscale, thresholding, denoising) before OCR
  • Use native PDF text extraction when available, OCR as fallback
  • Set appropriate DPI (300+) when converting PDFs to images
  • Filter low-confidence detections to reduce noise
  • Use layout analysis to preserve document structure
  • Cache OCR results for repeated processing
  • Choose engine based on accuracy needs vs. cost constraints
  • Handle multi-language documents with language detection

Anti-Patterns

| Anti-Pattern | Fix |
|---|---|
| OCR without preprocessing | Apply grayscale, thresholding, denoising |
| Low DPI PDF conversion | Use 300+ DPI for better accuracy |
| Ignoring confidence scores | Filter results below threshold |
| No fallback for native PDF | Try native first, OCR if sparse |
| Processing entire page | Crop to regions of interest |
| No layout preservation | Use structured extraction tools |
| Single language assumption | Detect and specify language codes |

Related

  • Skill: vision-agents
  • Skill: rag-patterns
  • Skill: advanced-retrieval

Prerequisites

[!IMPORTANT] Requirements:

  • Packages: pytesseract, pillow, pdf2image, easyocr, unstructured[pdf], pymupdf, opencv-python