Testing & Validation Skill

Description

Manages testing strategies, validation workflows, and data quality checks for the OpenGov Harvester.

Triggers

- "run tests"
- "validate extraction"
- "check data quality"
- "test pipeline"
- "integration test"

Test Organization

Directory Structure

code

tests/
├── unit/              # Unit tests for individual modules
│   ├── test_database.py
│   ├── test_extraction.py
│   └── test_utils.py
├── integration/       # Integration tests for workflows
│   ├── test_etl_pipeline.py
│   └── test_database_sync.py
├── e2e/               # End-to-end browser automation tests
│   ├── test_login_flow.py
│   └── test_extraction_flow.py
├── fixtures/          # Test data and fixtures
│   ├── sample_projects.json
│   └── mock_responses.json
└── conftest.py        # Pytest configuration and fixtures

Running Tests

All Tests

bash

# Run complete test suite
./run_tests.sh

# Or directly with pytest
pytest tests/ -v

# With coverage
pytest tests/ --cov=src --cov-report=html
open htmlcov/index.html

Specific Test Types

bash

# Unit tests only
pytest tests/unit/ -v

# Integration tests only
pytest tests/integration/ -v

# E2E tests only
pytest tests/e2e/ -v

# Specific test file
pytest tests/unit/test_database.py -v

# Specific test function
pytest tests/unit/test_database.py::test_connection -v

Test Filtering

bash

# By marker
pytest -m "slow" -v          # Run slow tests
pytest -m "not slow" -v      # Skip slow tests

# By keyword
pytest -k "database" -v      # Tests matching "database"
pytest -k "not integration" -v  # Skip integration tests

Test Writing Conventions

Unit Test Example

python

# tests/unit/test_database.py
import pytest
import sqlite3
from src.database.connection import get_connection, init_database

@pytest.fixture
def test_db(tmp_path):
    """Provide an initialised SQLite connection backed by a per-test file.

    The database file lives under pytest's ``tmp_path`` directory, so every
    test that requests this fixture gets its own database.

    Yields:
        sqlite3.Connection: an open connection on which ``init_database``
        has already been called.
    """
    db_path = tmp_path / "test.db"
    conn = sqlite3.connect(str(db_path))
    try:
        # Inside the try block so the connection is also released when
        # initialisation itself fails, not only after the test has run.
        init_database(conn)
        yield conn
    finally:
        conn.close()

def test_connection_wal_mode(test_db):
    """The journal mode reported by the test database must be ``wal``."""
    row = test_db.execute("PRAGMA journal_mode").fetchone()
    journal_mode = row[0]
    assert journal_mode == "wal", "Database should use WAL mode"

def test_insert_project(test_db):
    """A project row written to ``opengov_projects`` can be read back by id."""
    new_row = ("test-123", "Test Project")
    test_db.execute(
        "INSERT INTO opengov_projects (project_id, name) VALUES (?, ?)",
        new_row,
    )

    # Look the row up again by the same project_id that was just inserted.
    lookup = test_db.execute(
        "SELECT name FROM opengov_projects WHERE project_id = ?",
        (new_row[0],),
    )
    stored = lookup.fetchone()

    assert stored[0] == "Test Project"

Integration Test Example

python

# tests/integration/test_etl_pipeline.py
import pytest
from src.etl.pipeline import run_extraction

@pytest.fixture
def mock_supabase(mocker):
    """Patch ``src.etl.pipeline.supabase`` and hand the mock to the test."""
    patched_client = mocker.patch('src.etl.pipeline.supabase')
    return patched_client

@pytest.mark.integration
def test_full_extraction_pipeline(test_db, mock_supabase):
    """Run ``run_extraction`` for one project and check result and DB state.

    ``mock_supabase`` is requested only for its patching side effect; the
    mock object itself is not inspected here.

    NOTE(review): ``test_db`` must be discoverable from this module (for
    example via ``conftest.py``) - confirm where the fixture is defined.
    """
    # Setup
    project_id = "test-project-123"

    # Execute
    result = run_extraction(project_id, db=test_db)

    # Verify the reported outcome
    assert result['success'] is True
    assert result['opportunities_extracted'] > 0

    # Check database state
    row = test_db.execute(
        "SELECT extracted FROM opengov_projects WHERE project_id = ?",
        (project_id,)
    ).fetchone()
    # fetchone() returns None when no row matches; assert that explicitly so
    # a missing row is reported clearly instead of as a TypeError on row[0].
    assert row is not None, f"No opengov_projects row found for {project_id}"
    assert row[0] == 1, f"Expected extracted == 1 but got {row[0]}"

E2E Test Example

python

# tests/e2e/test_extraction_flow.py
import pytest
from playwright.sync_api import Page, expect

@pytest.mark.e2e
@pytest.mark.slow
def test_login_and_extract(page: Page):
    """Log in through the UI and confirm the project list is populated.

    Despite its name, this test stops at the project list: it does not
    trigger an extraction. It is marked both ``e2e`` and ``slow`` so it can
    be selected or skipped with ``-m``.

    Args:
        page: Playwright ``Page`` to drive (presumably supplied by the
            pytest-playwright ``page`` fixture - confirm).
    """
    # Open the login page
    page.goto("https://opengov.example.com/login")

    # Fill in the credentials and submit the form
    page.fill("#username", "test@example.com")
    page.fill("#password", "test-password")
    page.click("button[type='submit']")

    # Block until the browser URL matches the dashboard glob pattern
    page.wait_for_url("**/dashboard")

    # The displayed user name confirms the session belongs to the test user
    expect(page.locator(".user-name")).to_contain_text("Test User")

    # Open the Projects view and wait for the list container to be visible
    page.click("text=Projects")
    expect(page.locator(".project-list")).to_be_visible()

    # At least one project entry must be rendered
    project_count = page.locator(".project-item").count()
    assert project_count > 0, "Should have at least one project"

Data Quality Checks

Validation Functions

python

# src/validation/quality_checks.py

def validate_project_data(project: dict) -> tuple[bool, list[str]]:
    """Validate project data completeness and integrity.

    Args:
        project: Mapping of project fields. ``project_id``, ``name`` and
            ``created_at`` must be present and truthy, ``version`` must be
            an ``int`` (``bool`` is rejected), and ``opportunities_count``
            is optional but must be a non-negative number when given.

    Returns:
        ``(is_valid, errors)``: ``errors`` holds one message per failed
        check, in check order, and ``is_valid`` is ``True`` only when
        ``errors`` is empty. Malformed values are reported as errors rather
        than raised.
    """
    errors = []

    # Required fields: a missing key and a falsy value are both reported.
    required = ('project_id', 'name', 'created_at')
    errors.extend(
        f"Missing required field: {field}"
        for field in required
        if not project.get(field)
    )

    # Data types: bool is a subclass of int, so exclude it explicitly.
    version = project.get('version')
    if isinstance(version, bool) or not isinstance(version, int):
        errors.append("Version must be an integer")

    # Business logic: an absent or None count is treated as "not provided".
    count = project.get('opportunities_count')
    if count is not None:
        if isinstance(count, bool) or not isinstance(count, (int, float)):
            # Comparing e.g. a string with 0 would raise TypeError.
            errors.append("Opportunities count must be a number")
        elif count < 0:
            errors.append("Opportunities count cannot be negative")

    return len(errors) == 0, errors

def validate_extraction_results(results: dict) -> bool:
    """Return True only when every extraction sanity check passes.

    The run must report ``success`` as exactly ``True``, have extracted at
    least one opportunity, have a non-negative download count, and carry an
    empty ``errors`` list. Missing counters default to 0 and a missing
    ``errors`` key defaults to an empty list.
    """
    # Every check is evaluated up front, before the results are combined.
    succeeded = results.get('success') is True
    found_opportunities = results.get('opportunities_extracted', 0) > 0
    download_count_ok = results.get('documents_downloaded', 0) >= 0
    error_free = results.get('errors', []) == []

    return succeeded and found_opportunities and download_count_ok and error_free

Running Quality Checks

bash

# Run validation script
python scripts/validate_data_quality.py

# Check specific project
python scripts/validate_data_quality.py --project-id abc123

# Generate report
python scripts/validate_data_quality.py --report-format html > quality_report.html

Test Data & Fixtures

Pytest Fixtures

python

# tests/conftest.py
import json
import sqlite3
from pathlib import Path

import pytest

from src.database.connection import init_database


@pytest.fixture
def sample_projects():
    """Load the sample project records from ``fixtures/sample_projects.json``.

    The path is resolved relative to this file, so the fixture does not
    depend on the directory pytest is started from.
    """
    fixture_path = Path(__file__).parent / "fixtures" / "sample_projects.json"
    # Explicit encoding keeps the result independent of the platform default.
    with open(fixture_path, encoding="utf-8") as f:
        return json.load(f)


@pytest.fixture
def mock_playwright_page(mocker):
    """Return a mock page whose ``goto``/``fill``/``click`` return ``None``."""
    page = mocker.Mock()
    page.goto.return_value = None
    page.fill.return_value = None
    page.click.return_value = None
    return page


@pytest.fixture(scope="session")
def test_database():
    """Session-scoped test database shared by all tests in the run.

    Yields:
        sqlite3.Connection: connection to ``tests/test_data.db`` on which
        ``init_database`` has been called. The path is relative to the
        current working directory.
    """
    db_path = "tests/test_data.db"
    conn = sqlite3.connect(db_path)
    try:
        init_database(conn)
        yield conn
    finally:
        # Runs even when initialisation fails, so a broken run does not
        # leave a stale database file behind for the next session.
        conn.close()
        Path(db_path).unlink(missing_ok=True)

Mock Data

json

// tests/fixtures/sample_projects.json
[
  {
    "project_id": "test-001",
    "name": "City Infrastructure Project",
    "created_at": "2026-01-01T00:00:00Z",
    "version": 1,
    "opportunities_count": 5,
    "extracted": false
  },
  {
    "project_id": "test-002",
    "name": "Highway Construction",
    "created_at": "2026-01-02T00:00:00Z",
    "version": 1,
    "opportunities_count": 3,
    "extracted": true
  }
]

Related Rules

- python-testing-structure.md: Test organization patterns
- python-modularity-patterns.md: Testable module design
- data-quality-checks.md: Data validation patterns
- etl-idempotency-patterns.md: Idempotent test design

Test Configuration

pytest.ini

ini

[pytest]
minversion = 6.0
testpaths = tests
python_files = test_*.py
python_classes = Test*
python_functions = test_*
markers =
    slow: marks tests as slow (deselect with '-m "not slow"')
    integration: integration tests
    e2e: end-to-end tests
    unit: unit tests
addopts =
    -v
    --strict-markers
    --tb=short
    --disable-warnings

Coverage Configuration

ini

# .coveragerc
[run]
source = src
omit =
    */tests/*
    */venv/*
    */migrations/*

[report]
precision = 2
show_missing = True
skip_covered = False

[html]
directory = htmlcov

Continuous Integration

GitHub Actions Example

yaml

# .github/workflows/test.yml
name: Test Suite

on: [push, pull_request]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      # The @v2 releases of these actions run on deprecated Node.js
      # runtimes; use the current major versions instead.
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: |
          pip install -r requirements.txt
          pip install -r requirements-test.txt
          # --with-deps also installs the OS packages the browsers need
          playwright install --with-deps

      - name: Run tests
        run: pytest tests/ --cov=src --cov-report=xml

      - name: Upload coverage
        uses: codecov/codecov-action@v4
        with:
          files: ./coverage.xml
          # codecov-action v4 expects an upload token; store it as a repo secret
          token: ${{ secrets.CODECOV_TOKEN }}

Performance Testing

Benchmark Tests

python

import pytest
import time

@pytest.mark.benchmark
def test_extraction_performance(benchmark):
    """Benchmark a simulated extraction and bound its mean run time."""
    def simulated_extraction():
        # Stand-in for the real extraction: sleep, then report a fixed result
        time.sleep(0.5)
        return {"success": True, "count": 100}

    outcome = benchmark(simulated_extraction)
    assert outcome['success']

    # Should complete in <1s
    mean_seconds = benchmark.stats['mean']
    assert mean_seconds < 1.0

Load Testing

bash

# Using locust for load testing
pip install locust

# Run load test
locust -f tests/load/extraction_load_test.py --host=http://localhost:8080

Troubleshooting Tests

Common Issues

Import Errors:

bash

# Run from the project root. The tests import modules as `from src... import ...`,
# so the directory that CONTAINS src/ must be on the Python path (not src/ itself).
# The ${VAR:+...} form avoids a stray leading ':' when PYTHONPATH is unset.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}$(pwd)"

Playwright Errors:

bash

# Install browser binaries
playwright install

# Run with headed mode for debugging
pytest tests/e2e/ --headed

Database Locked:

python

# Use a separate, per-test database and always close the connection:
# a connection that is left open is itself a common cause of "database is locked".
@pytest.fixture
def test_db(tmp_path):
    """Yield a WAL-mode SQLite connection to a per-test database file.

    The file is created under pytest's ``tmp_path`` directory, so tests do
    not share a database. The connection is closed once the test finishes.
    """
    db_path = tmp_path / "test.db"
    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        yield conn
    finally:
        conn.close()

Best Practices

Test Naming

python

# Good: the name states what is exercised and the expected outcome,
# following the pattern test_<subject>_<expected behaviour>
def test_extraction_creates_database_record():
    ...

def test_extraction_handles_missing_data_gracefully():
    ...

# Bad: the name says nothing about what is tested or what should happen,
# so a failure report gives the reader no hint about what broke
def test_function():
    ...

def test_it_works():
    ...

Test Independence

python

# A test must not rely on state left behind by any other test
def test_insert_project(test_db):
    """Insert one project and confirm it can be fetched again."""
    # Arrange
    project_id = "test-123"

    # Act
    insert_project(test_db, project_id, "Test")
    fetched = get_project(test_db, project_id)

    # Assert
    assert fetched is not None

    # No cleanup step: the test_db fixture hands out a fresh database per test

Assertion Messages

python

# Provide clear assertion messages.
# Use a plain string when the message has no placeholders, and an f-string
# only when the message interpolates the value that was actually seen.
assert len(results) > 0, "Expected results but got empty list"
assert project['version'] == 1, f"Expected version 1 but got {project['version']}"