Testing & Validation Skill
Description
Manages testing strategies, validation workflows, and data quality checks for the OpenGov Harvester.
Triggers
- "run tests"
- "validate extraction"
- "check data quality"
- "test pipeline"
- "integration test"
Test Organization
Directory Structure
code
tests/
├── unit/                    # Unit tests for individual modules
│   ├── test_database.py
│   ├── test_extraction.py
│   └── test_utils.py
├── integration/             # Integration tests for workflows
│   ├── test_etl_pipeline.py
│   └── test_database_sync.py
├── e2e/                     # End-to-end browser automation tests
│   ├── test_login_flow.py
│   └── test_extraction_flow.py
├── fixtures/                # Test data and fixtures
│   ├── sample_projects.json
│   └── mock_responses.json
└── conftest.py              # Pytest configuration and fixtures
Running Tests
All Tests
bash
# Run complete test suite
./run_tests.sh

# Or directly with pytest
pytest tests/ -v

# With coverage (HTML report is written to htmlcov/)
pytest tests/ --cov=src --cov-report=html
open htmlcov/index.html   # macOS; use xdg-open on Linux
Specific Test Types
bash
# Unit tests only
pytest tests/unit/ -v

# Integration tests only
pytest tests/integration/ -v

# E2E tests only
pytest tests/e2e/ -v

# Specific test file
pytest tests/unit/test_database.py -v

# Specific test function
pytest tests/unit/test_database.py::test_connection -v
Test Filtering
bash
# By marker
pytest -m "slow" -v               # Run slow tests
pytest -m "not slow" -v           # Skip slow tests

# By keyword
pytest -k "database" -v           # Tests matching "database"
pytest -k "not integration" -v    # Skip integration tests
Test Writing Conventions
Unit Test Example
python
# tests/unit/test_database.py
import pytest
import sqlite3
from src.database.connection import get_connection, init_database
@pytest.fixture
def test_db(tmp_path):
    """Create a temporary, initialised test database.

    The database file is created under pytest's ``tmp_path`` directory and
    the open connection is yielded to the test. The connection is closed
    during fixture teardown.
    """
    db_path = tmp_path / "test.db"
    conn = sqlite3.connect(str(db_path))
    try:
        init_database(conn)
        yield conn
    finally:
        # Also runs when init_database() raises, so the connection is
        # never left open.
        conn.close()
def test_connection_wal_mode(test_db):
    """Verify WAL mode is enabled."""
    # PRAGMA journal_mode returns a single row holding the current mode.
    cursor = test_db.execute("PRAGMA journal_mode")
    mode = cursor.fetchone()[0]

    assert mode == "wal", "Database should use WAL mode"
def test_insert_project(test_db):
    """Insert a project row and read its name back."""
    test_db.execute(
        "INSERT INTO opengov_projects (project_id, name) VALUES (?, ?)",
        ("test-123", "Test Project"),
    )

    # Read back through the same connection that performed the insert.
    result = test_db.execute(
        "SELECT name FROM opengov_projects WHERE project_id = ?",
        ("test-123",),
    ).fetchone()

    assert result[0] == "Test Project"
Integration Test Example
python
# tests/integration/test_etl_pipeline.py
import pytest
from src.etl.pipeline import run_extraction
@pytest.fixture
def mock_supabase(mocker):
    """Mock the Supabase client referenced by the ETL pipeline module.

    Patches the name ``supabase`` inside ``src.etl.pipeline`` and returns
    the resulting mock object.
    """
    return mocker.patch('src.etl.pipeline.supabase')
@pytest.mark.integration
def test_full_extraction_pipeline(test_db, mock_supabase):
    """Test the complete extraction pipeline against a test database."""
    # NOTE(review): requires a ``test_db`` fixture visible to this module
    # (for example one defined in tests/conftest.py) - confirm.

    # Setup
    project_id = "test-project-123"

    # Execute
    result = run_extraction(project_id, db=test_db)

    # Verify
    assert result['success'] is True
    assert result['opportunities_extracted'] > 0

    # Check database state
    cursor = test_db.execute(
        "SELECT extracted FROM opengov_projects WHERE project_id = ?",
        (project_id,),
    )
    row = cursor.fetchone()
    # Fail with a readable message instead of a TypeError on ``None[0]``
    # when the project row is missing.
    assert row is not None, f"No opengov_projects row for {project_id}"
    assert row[0] == 1
E2E Test Example
python
# tests/e2e/test_extraction_flow.py
import pytest
from playwright.sync_api import Page, expect
@pytest.mark.e2e
@pytest.mark.slow
def test_login_and_extract(page: Page):
    """Test the full login flow and confirm projects are listed."""
    # Navigate to login
    page.goto("https://opengov.example.com/login")

    # Login
    page.fill("#username", "test@example.com")
    page.fill("#password", "test-password")
    page.click("button[type='submit']")

    # Wait for navigation
    page.wait_for_url("**/dashboard")

    # Verify logged in
    expect(page.locator(".user-name")).to_contain_text("Test User")

    # Navigate to projects
    page.click("text=Projects")
    expect(page.locator(".project-list")).to_be_visible()

    # Count projects
    project_count = page.locator(".project-item").count()
    assert project_count > 0, "Should have at least one project"
Data Quality Checks
Validation Functions
python
# src/validation/quality_checks.py
def validate_project_data(project: dict) -> tuple[bool, list[str]]:
    """Validate project data completeness and integrity.

    Args:
        project: Mapping of project fields to check.

    Returns:
        A ``(is_valid, errors)`` pair. ``is_valid`` is True only when
        ``errors`` is empty; ``errors`` lists one message per problem found.
    """
    errors = []

    # Required fields: must be present and non-empty.
    for field in ('project_id', 'name', 'created_at'):
        if not project.get(field):
            errors.append(f"Missing required field: {field}")

    # Data types. bool is a subclass of int, so reject it explicitly;
    # otherwise True/False would pass as a version number.
    version = project.get('version')
    if isinstance(version, bool) or not isinstance(version, int):
        errors.append("Version must be an integer")

    # Business logic. Check the type first so a malformed value (None, a
    # string, ...) is reported as an error instead of raising TypeError.
    count = project.get('opportunities_count', 0)
    if not isinstance(count, (int, float)):
        errors.append("Opportunities count must be a number")
    elif count < 0:
        errors.append("Opportunities count cannot be negative")

    return not errors, errors
def validate_extraction_results(results: dict) -> bool:
    """Validate extraction results.

    Args:
        results: Mapping produced by an extraction run.

    Returns:
        True only when ``success`` is exactly ``True``, at least one
        opportunity was extracted, the downloaded-document count is not
        negative, and the ``errors`` list is empty (or absent). Missing
        counts default to 0; non-numeric counts make the result invalid
        rather than raising ``TypeError``.
    """
    extracted = results.get('opportunities_extracted', 0)
    downloaded = results.get('documents_downloaded', 0)

    return (
        results.get('success') is True
        and isinstance(extracted, (int, float))
        and extracted > 0
        and isinstance(downloaded, (int, float))
        and downloaded >= 0
        and results.get('errors', []) == []
    )
Running Quality Checks
bash
# Run validation script
python scripts/validate_data_quality.py

# Check specific project
python scripts/validate_data_quality.py --project-id abc123

# Generate report
python scripts/validate_data_quality.py --report-format html > quality_report.html
Test Data & Fixtures
Pytest Fixtures
python
# tests/conftest.py
import json
import os
import sqlite3
from pathlib import Path

import pytest

from src.database.connection import init_database
@pytest.fixture
def sample_projects():
    """Load sample project data from the JSON fixture file.

    Returns the parsed contents of ``fixtures/sample_projects.json``,
    resolved relative to this file.
    """
    fixture_path = Path(__file__).parent / "fixtures" / "sample_projects.json"
    # Explicit encoding keeps the fixture readable regardless of the
    # platform's default locale encoding.
    with fixture_path.open(encoding="utf-8") as f:
        return json.load(f)
@pytest.fixture
def mock_playwright_page(mocker):
    """Mock Playwright page object.

    Returns a mock whose ``goto``, ``fill`` and ``click`` methods all
    return ``None``.
    """
    page = mocker.Mock()
    for method_name in ("goto", "fill", "click"):
        getattr(page, method_name).return_value = None
    return page
@pytest.fixture(scope="session")
def test_database():
    """Session-scoped test database.

    Opens (creating if needed) ``tests/test_data.db``, initialises it, and
    yields the connection. On teardown the connection is closed and the
    database file is deleted.
    """
    # NOTE: relative path - resolved against the directory pytest runs from.
    db_path = "tests/test_data.db"
    conn = sqlite3.connect(db_path)
    try:
        init_database(conn)
        yield conn
    finally:
        # Clean up even when init_database() raises, so a failed setup
        # does not leave a stale database file for the next run.
        conn.close()
        if os.path.exists(db_path):
            os.remove(db_path)
Mock Data
json
// tests/fixtures/sample_projects.json
[
{
"project_id": "test-001",
"name": "City Infrastructure Project",
"created_at": "2026-01-01T00:00:00Z",
"version": 1,
"opportunities_count": 5,
"extracted": false
},
{
"project_id": "test-002",
"name": "Highway Construction",
"created_at": "2026-01-02T00:00:00Z",
"version": 1,
"opportunities_count": 3,
"extracted": true
}
]
Related Rules
- python-testing-structure.md: Test organization patterns
- python-modularity-patterns.md: Testable module design
- data-quality-checks.md: Data validation patterns
- etl-idempotency-patterns.md: Idempotent test design
Test Configuration
pytest.ini
ini
# Continuation lines of multi-line values (markers, addopts) must be indented.
[pytest]
minversion = 6.0
testpaths = tests
python_files = test_*.py
python_classes = Test*
python_functions = test_*
markers =
    slow: marks tests as slow (deselect with '-m "not slow"')
    integration: integration tests
    e2e: end-to-end tests
    unit: unit tests
addopts =
    -v
    --strict-markers
    --tb=short
    --disable-warnings
Coverage Configuration
ini
# .coveragerc
[run]
source = src
# Continuation lines of the multi-line "omit" value must be indented.
omit =
    */tests/*
    */venv/*
    */migrations/*

[report]
precision = 2
show_missing = True
skip_covered = False

[html]
directory = htmlcov
Continuous Integration
GitHub Actions Example
yaml
# .github/workflows/test.yml
name: Test Suite

on: [push, pull_request]

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: |
          pip install -r requirements.txt
          pip install -r requirements-test.txt
          # --with-deps also installs the OS libraries the browsers need
          playwright install --with-deps

      - name: Run tests
        run: pytest tests/ --cov=src --cov-report=xml

      - name: Upload coverage
        uses: codecov/codecov-action@v4
        with:
          files: ./coverage.xml
          token: ${{ secrets.CODECOV_TOKEN }}
Performance Testing
Benchmark Tests
python
import pytest
import time
@pytest.mark.benchmark
def test_extraction_performance(benchmark):
    """Benchmark a simulated extraction and check its mean runtime."""

    def run_extraction():
        # Simulate extraction
        time.sleep(0.5)
        return {"success": True, "count": 100}

    # The benchmark fixture runs the callable and hands back its result.
    result = benchmark(run_extraction)

    assert result['success']
    assert benchmark.stats['mean'] < 1.0  # Should complete in <1s
Load Testing
bash
# Using locust for load testing
pip install locust

# Run load test
locust -f tests/load/extraction_load_test.py --host=http://localhost:8080
Troubleshooting Tests
Common Issues
Import Errors:
bash
# Ensure the project root is on the Python path so `from src... import ...` resolves
# (the ${VAR:+...} form avoids a leading ':' when PYTHONPATH is unset)
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}$(pwd)"
Playwright Errors:
bash
# Install browser binaries
playwright install

# Run with headed mode for debugging
pytest tests/e2e/ --headed
Database Locked:
python
# Use separate test database
@pytest.fixture
def test_db(tmp_path):
    """Provide a separate WAL-mode SQLite database for each test.

    The connection is closed on teardown; a connection left open can keep
    the database locked.
    """
    db_path = tmp_path / "test.db"
    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        yield conn
    finally:
        conn.close()
Best Practices
Test Naming
python
# Good: Descriptive, follows pattern
def test_extraction_creates_database_record():
    ...


def test_extraction_handles_missing_data_gracefully():
    ...


# Bad: Vague, unclear
def test_function():
    ...


def test_it_works():
    ...
Test Independence
python
# Each test should be independent
def test_insert_project(test_db):
    """Insert a project and confirm it can be read back."""
    # Setup
    project_id = "test-123"

    # Execute
    insert_project(test_db, project_id, "Test")

    # Verify
    result = get_project(test_db, project_id)
    assert result is not None

    # Cleanup not needed - test_db fixture is fresh each time
Assertion Messages
python
# Provide clear assertion messages
# A message without placeholders is a plain string, not an f-string.
assert len(results) > 0, "Expected results but got empty list"
assert project['version'] == 1, f"Expected version 1 but got {project['version']}"