Web Scraping Automation

Automated browser-based data extraction and interaction using Playwright. Perfect for monitoring websites, extracting data, and automating web workflows.

When to Use

•Extracting data from websites without APIs
•Monitoring website changes
•Automating form submissions
•Taking scheduled screenshots
•Testing web applications
•Price monitoring and alerting
•Content aggregation

Quick Start

bash

# Setup scraping environment
./setup.sh

# Run a scraper script (here: the Hacker News example)
./scrape.sh run examples/hacker-news.js

# Schedule recurring scrape of a URL (--every takes the interval, 1h here)
./scrape.sh schedule --url "https://example.com" --every 1h

# Take screenshot of a URL and save it as page.png
./scrape.sh screenshot https://example.com page.png

Installation

Prerequisites

bash

# Install Playwright into a fresh npm project, then download the Chromium build
npm init -y
npm install playwright
npx playwright install chromium

# Or use Docker (interactive, container removed on exit; image pinned to v1.40.0)
# NOTE(review): presumably the image tag should match the installed playwright version - confirm
docker run -it --rm mcr.microsoft.com/playwright:v1.40.0-jammy

Basic Scraping

Simple Page Scraper

javascript

// scrapers/basic.js
const { chromium } = require('playwright');

/**
 * Load a page and extract its title, headings, and first ten links.
 *
 * @param {string} url - Address of the page to load.
 * @returns {Promise<{title: string, headings: string[], links: Array<{text: string, href: string}>}>}
 *     Data read from the loaded document.
 */
async function scrape(url) {
    const browser = await chromium.launch({ headless: true });
    try {
        const context = await browser.newContext();
        const page = await context.newPage();

        await page.goto(url);

        // Extract data. The callback is evaluated inside the page, so it may
        // only rely on DOM globals, not on variables from this file.
        return await page.evaluate(() => ({
            title: document.title,
            headings: Array.from(document.querySelectorAll('h1, h2, h3'))
                .map(h => h.textContent.trim()),
            // Slice first so only the ten links that are kept get mapped.
            links: Array.from(document.querySelectorAll('a[href]'))
                .slice(0, 10)
                .map(a => ({ text: a.textContent, href: a.href }))
        }));
    } finally {
        // Always release the browser, even when navigation or extraction throws.
        await browser.close();
    }
}

scrape('https://example.com')
    .then(data => console.log(JSON.stringify(data, null, 2)))
    .catch(error => {
        // Report the failure and exit non-zero instead of leaving the
        // rejection unhandled.
        console.error('Scrape failed:', error);
        process.exitCode = 1;
    });

Run the Scraper

bash

# Runs the basic scraper above, which prints the extracted data as JSON
node scrapers/basic.js

Advanced Scraping Patterns

Login and Session Management

javascript

// scrapers/authenticated.js
const { chromium } = require('playwright');

/**
 * Scraper that signs in through a login form and then reads pages that
 * require the session.
 *
 * Call order: `init()`, `login()`, any scrape method, then `close()`.
 */
class AuthenticatedScraper {
    constructor() {
        // Set by init(); null before that and again after close().
        this.browser = null;
        this.context = null;
        this.page = null;
    }

    /** Launch headless Chromium and open one context with one page. */
    async init() {
        this.browser = await chromium.launch({ headless: true });
        this.context = await this.browser.newContext();
        this.page = await this.context.newPage();
    }

    /**
     * Submit the login form and save the resulting session to auth.json.
     *
     * @param {{email: string, password: string}} credentials - Values typed into the form.
     * @throws {Error} 'Login failed' when `.user-profile` is not visible after submitting.
     */
    async login(credentials) {
        await this.page.goto('https://example.com/login');

        // Fill login form
        await this.page.fill('input[name="email"]', credentials.email);
        await this.page.fill('input[name="password"]', credentials.password);

        // Start waiting for the navigation before clicking so it cannot be missed.
        await Promise.all([
            this.page.waitForNavigation(),
            this.page.click('button[type="submit"]')
        ]);

        // Verify login success
        const loggedIn = await this.page.locator('.user-profile').isVisible();
        if (!loggedIn) throw new Error('Login failed');

        // Save session (the file holds the context's storage state, so treat it as a secret)
        await this.context.storageState({ path: 'auth.json' });
    }

    /**
     * Read the stats text and notification texts from the dashboard page.
     *
     * @returns {Promise<{stats: (string|undefined), notifications: string[]}>}
     */
    async scrapeDashboard() {
        await this.page.goto('https://example.com/dashboard');

        return await this.page.evaluate(() => ({
            stats: document.querySelector('.stats')?.textContent,
            notifications: Array.from(document.querySelectorAll('.notification'))
                .map(n => n.textContent.trim())
        }));
    }

    /**
     * Close the browser if one is open.
     *
     * Safe to call before init(), after a failed init(), or more than once:
     * the references are cleared first, so a second call is a no-op.
     */
    async close() {
        const browser = this.browser;
        this.browser = null;
        this.context = null;
        this.page = null;
        if (browser) {
            await browser.close();
        }
    }
}

// Usage
(async () => {
    const scraper = new AuthenticatedScraper();
    try {
        await scraper.init();
        // Placeholder credentials: load real ones from the environment or a
        // secret store rather than hard-coding them.
        await scraper.login({ email: 'user@example.com', password: 'secret' });
        const data = await scraper.scrapeDashboard();
        console.log(data);
    } finally {
        // Release the browser even when login or scraping throws. The guard
        // avoids calling close() when the launch itself failed.
        if (scraper.browser) {
            await scraper.close();
        }
    }
})().catch(error => {
    // Surface the failure instead of leaving an unhandled rejection.
    console.error(error);
    process.exitCode = 1;
});

Pagination Handling

javascript

// scrapers/paginated.js
/**
 * Collect `.item` entries from a paginated listing by following the
 * "next" button.
 *
 * @param {string} url - First page of the listing.
 * @param {number} [maxPages=10] - Upper bound on the number of pages scraped.
 * @returns {Promise<Array<{title: (string|undefined), price: (string|undefined), link: (string|undefined)}>>}
 *     Items from every visited page, in page order.
 */
async function scrapePaginated(url, maxPages = 10) {
    const browser = await chromium.launch();
    try {
        const page = await browser.newPage();
        const allItems = [];

        await page.goto(url);

        for (let currentPage = 1; currentPage <= maxPages; currentPage++) {
            console.log(`Scraping page ${currentPage}...`);

            // Extract items from current page
            const items = await page.$$eval('.item', items =>
                items.map(item => ({
                    title: item.querySelector('.title')?.textContent,
                    price: item.querySelector('.price')?.textContent,
                    link: item.querySelector('a')?.href
                }))
            );

            allItems.push(...items);

            // Stop when there is no enabled "next" button, or when the page
            // limit is reached (no point navigating to a page we will not read).
            const nextButton = await page.$('.pagination .next:not(.disabled)');
            if (!nextButton || currentPage === maxPages) break;

            // Start waiting for the navigation before clicking so it cannot be missed.
            await Promise.all([
                page.waitForNavigation(),
                nextButton.click()
            ]);

            // Be nice to the server
            await page.waitForTimeout(1000);
        }

        return allItems;
    } finally {
        // Always release the browser, even when a page fails to load or parse.
        await browser.close();
    }
}

Infinite Scroll

javascript

// scrapers/infinite-scroll.js
/**
 * Scroll a page to the bottom repeatedly until its height stops growing,
 * then extract every `.item`.
 *
 * @param {string} url - Page to load.
 * @param {object} [options]
 * @param {number} [options.maxScrolls=Infinity] - Upper bound on scroll rounds.
 *     The default keeps the original unbounded behaviour; pass a finite value
 *     for feeds that never stop loading.
 * @param {number} [options.scrollDelayMs=2000] - Pause after each scroll so
 *     newly requested content can load.
 * @returns {Promise<Array<{title: (string|undefined), content: (string|undefined)}>>}
 */
async function scrapeInfiniteScroll(url, { maxScrolls = Infinity, scrollDelayMs = 2000 } = {}) {
    const browser = await chromium.launch();
    try {
        const page = await browser.newPage();

        await page.goto(url);

        const readHeight = () => page.evaluate(() => document.body.scrollHeight);

        // Scroll until no new content loads (height unchanged) or the cap is hit.
        let previousHeight = 0;
        let currentHeight = await readHeight();

        for (let scrolls = 0; previousHeight !== currentHeight && scrolls < maxScrolls; scrolls++) {
            previousHeight = currentHeight;

            // Scroll to bottom
            await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));

            // Wait for content to load
            await page.waitForTimeout(scrollDelayMs);

            currentHeight = await readHeight();
        }

        // Now extract all data
        return await page.evaluate(() =>
            Array.from(document.querySelectorAll('.item')).map(item => ({
                title: item.querySelector('.title')?.textContent,
                content: item.querySelector('.content')?.textContent
            }))
        );
    } finally {
        // Always release the browser, even when scrolling or extraction throws.
        await browser.close();
    }
}

Data Extraction Utilities

Table Extraction

javascript

// utils/table-extractor.js
/**
 * Convert the table matched by `selector` into header names plus one
 * object per body row, keyed by header text.
 *
 * @param {object} page - Object exposing `$eval(selector, fn)`.
 * @param {string} selector - Selector of the table element.
 * @returns {Promise<{headers: string[], rows: object[]}>}
 */
async function extractTable(page, selector) {
    // Evaluated by page.$eval, so it is kept self-contained.
    const readTable = (tableEl) => {
        const headers = [];
        for (const th of Array.from(tableEl.querySelectorAll('th'))) {
            headers.push(th.textContent.trim());
        }

        const rows = [];
        for (const tr of Array.from(tableEl.querySelectorAll('tbody tr'))) {
            const tds = Array.from(tr.querySelectorAll('td'));
            const record = {};
            // A row with fewer cells than headers gets undefined for the rest.
            for (let col = 0; col < headers.length; col++) {
                record[headers[col]] = tds[col]?.textContent.trim();
            }
            rows.push(record);
        }

        return { headers, rows };
    };

    const result = await page.$eval(selector, readTable);
    return result;
}

JSON-LD Extraction

javascript

// utils/jsonld-extractor.js
/**
 * Parse every JSON-LD script on the page.
 *
 * Scripts whose text is not valid JSON, and parsed values that are falsy,
 * are left out of the result.
 *
 * @param {object} page - Object exposing `evaluate(fn)`.
 * @returns {Promise<Array>} Parsed JSON-LD values in document order.
 */
async function extractJsonLd(page) {
    // Evaluated by page.evaluate, so it is kept self-contained.
    const collectJsonLd = () => {
        const parsed = [];
        const nodes = Array.from(
            document.querySelectorAll('script[type="application/ld+json"]')
        );
        for (const node of nodes) {
            let value = null;
            try {
                value = JSON.parse(node.textContent);
            } catch (e) {
                value = null;
            }
            if (value) {
                parsed.push(value);
            }
        }
        return parsed;
    };

    const found = await page.evaluate(collectJsonLd);
    return found;
}

Monitoring and Alerting

Change Detection

javascript

// monitors/change-detector.js
const fs = require('fs').promises;
const crypto = require('crypto');

/**
 * Detects content changes per URL by comparing an MD5 hash of the content
 * with the hash stored in a JSON state file.
 */
class ChangeDetector {
    /**
     * @param {string} [storagePath='./.monitor-state.json'] - JSON file mapping URL to last seen hash.
     */
    constructor(storagePath = './.monitor-state.json') {
        this.storagePath = storagePath;
    }

    /**
     * Report whether `content` differs from what was last recorded for `url`,
     * recording the new hash when it does.
     *
     * The first check for a URL returns true because no hash is stored yet.
     *
     * @param {string} url - Key under which the hash is stored.
     * @param {string|Buffer} content - Content to fingerprint.
     * @returns {Promise<boolean>} true when the stored hash was missing or different.
     * @throws {Error} When the state file exists but cannot be read or parsed.
     */
    async hasChanged(url, content) {
        // MD5 is used only as a change fingerprint here, not for security.
        const hash = crypto.createHash('md5').update(content).digest('hex');

        let state = {};
        try {
            const data = await fs.readFile(this.storagePath, 'utf8');
            state = JSON.parse(data);
        } catch (error) {
            // Only a missing file means "no state yet". Anything else (corrupt
            // JSON, permissions) must not be treated as empty state, because the
            // write below would then wipe the hashes of every other URL.
            if (error.code !== 'ENOENT') {
                throw new Error(
                    `Cannot read monitor state from ${this.storagePath}`,
                    { cause: error }
                );
            }
        }

        const previousHash = state[url];
        const changed = previousHash !== hash;

        if (changed) {
            state[url] = hash;
            await fs.writeFile(this.storagePath, JSON.stringify(state, null, 2));
        }

        return changed;
    }
}

// Usage in scraper
// `page` and `url` are expected to come from the surrounding scraper.
// The first check for a URL reports a change, since no hash is stored yet.
const detector = new ChangeDetector();
// page.content() is the whole document markup, so any difference in it counts.
const content = await page.content();
if (await detector.hasChanged(url, content)) {
    console.log('Page has changed!');
    // Send notification, etc.
}

Price Monitor

javascript

// monitors/price-monitor.js
/**
 * Read a price from a page and report whether it is at or below a threshold.
 *
 * @param {string} url - Page to load.
 * @param {string} selector - Selector of the element whose text holds the price.
 * @param {number} threshold - Alert when the parsed price is less than or equal to this.
 * @returns {Promise<{alert: true, message: string, price: number, url: string} | {alert: false, price: number}>}
 * @throws {Error} When the element text contains no parseable number.
 */
async function monitorPrice(url, selector, threshold) {
    const browser = await chromium.launch();
    let priceText;
    try {
        const page = await browser.newPage();
        await page.goto(url);
        priceText = await page.locator(selector).textContent();
    } finally {
        // Always release the browser, even when the page or element is missing.
        await browser.close();
    }

    // Drop everything except digits and dots (currency symbols, thousands
    // commas). Assumes '.' is the decimal separator.
    const price = parseFloat((priceText ?? '').replace(/[^0-9.]/g, ''));
    if (!Number.isFinite(price)) {
        // Fail loudly: a NaN price would otherwise look like "no alert".
        throw new Error(`Could not parse price from "${priceText}" at ${url}`);
    }

    if (price <= threshold) {
        return {
            alert: true,
            message: `Price dropped to $${price}! (threshold: $${threshold})`,
            price,
            url
        };
    }

    return { alert: false, price };
}

Screenshot Automation

Full Page Screenshots

javascript

// screenshots/full-page.js
/**
 * Save a full-page screenshot of `url` to `outputPath`.
 *
 * @param {string} url - Page to capture.
 * @param {string} outputPath - File the screenshot is written to.
 * @returns {Promise<void>}
 */
async function captureFullPage(url, outputPath) {
    const browser = await chromium.launch();
    try {
        const page = await browser.newPage();

        await page.goto(url, { waitUntil: 'networkidle' });

        // Handle cookie banners (example): remove the element so it does not
        // appear in the capture.
        const cookieBanner = await page.$('.cookie-banner');
        if (cookieBanner) {
            await cookieBanner.evaluate(el => el.remove());
        }

        await page.screenshot({
            path: outputPath,
            fullPage: true
        });
    } finally {
        // Always release the browser, even when loading or capturing fails.
        await browser.close();
    }

    console.log(`Screenshot saved to ${outputPath}`);
}

Element Screenshots

javascript

// screenshots/element.js
/**
 * Save a screenshot of the element matched by `selector` on `url`.
 *
 * @param {string} url - Page to load.
 * @param {string} selector - Selector of the element to capture.
 * @param {string} outputPath - File the screenshot is written to.
 * @returns {Promise<void>}
 */
async function captureElement(url, selector, outputPath) {
    const browser = await chromium.launch();
    try {
        const page = await browser.newPage();

        await page.goto(url);

        // locator() returns the locator directly, so it needs no await.
        const element = page.locator(selector);
        await element.screenshot({ path: outputPath });
    } finally {
        // Always release the browser, even when the element cannot be captured.
        await browser.close();
    }
}

Before/After Comparison

javascript

// screenshots/compare.js
const { chromium } = require('playwright');
const { expect } = require('@playwright/test');

/**
 * Capture a fresh screenshot of `url` for comparison against a baseline.
 *
 * Only the capture step is implemented; the comparison itself is left to a
 * pixel-diff library of your choice, so `baselinePath` is not read yet.
 *
 * @param {string} url - Page to capture.
 * @param {string} baselinePath - Previously saved screenshot to compare against (currently unused).
 * @param {string} outputPath - File the new screenshot is written to.
 * @returns {Promise<void>}
 */
async function compareScreenshots(url, baselinePath, outputPath) {
    const browser = await chromium.launch();
    try {
        const page = await browser.newPage();

        await page.goto(url);

        // Take new screenshot
        await page.screenshot({ path: outputPath });

        // Compare (requires pixelmatch or similar)
        // Implementation depends on comparison library
    } finally {
        // Always release the browser, even when loading or capturing fails.
        await browser.close();
    }
}

Scheduling and Automation

Cron-Based Scraping

bash

# Add to crontab
# Runs every 30 minutes from the scraper directory; stdout and stderr are
# appended to logs/scraper.log (the logs/ directory must already exist).
*/30 * * * * cd /path/to/scraper && node scrapers/monitor.js >> logs/scraper.log 2>&1

Node Scheduler

javascript

// scheduler.js
const cron = require('node-cron');

// Run every hour
cron.schedule('0 * * * *', async () => {
    console.log('Running scheduled scrape...');
    try {
        await runScraper();
    } catch (error) {
        // Log and carry on: a rejected callback would otherwise surface as an
        // unhandled rejection.
        console.error('Scheduled scrape failed:', error);
    }
});

// Run every 5 minutes during business hours (09:00-17:59, Monday to Friday)
cron.schedule('*/5 9-17 * * 1-5', async () => {
    console.log('Running business hours check...');
    try {
        await checkPrices();
    } catch (error) {
        console.error('Business hours check failed:', error);
    }
});

Data Storage

JSON Export

javascript

// storage/json-exporter.js
const fs = require('fs').promises;

/**
 * Write `data` as pretty-printed JSON to ./data/<filename>-<YYYY-MM-DD>.json,
 * creating the ./data directory when needed.
 *
 * @param {*} data - Value to serialise.
 * @param {string} filename - Base name of the file, without date or extension.
 * @returns {Promise<string>} Path of the file that was written.
 */
async function exportToJson(data, filename) {
    // The ISO string looks like 2024-01-31T12:00:00.000Z; keep the date part.
    const [datePart] = new Date().toISOString().split('T');
    const target = `./data/${filename}-${datePart}.json`;

    await fs.mkdir('./data', { recursive: true });

    const serialized = JSON.stringify(data, null, 2);
    await fs.writeFile(target, serialized);

    return target;
}

CSV Export

javascript

// storage/csv-exporter.js
const fs = require('fs');
const { Parser } = require('json2csv');

/**
 * Write `data` as CSV to ./data/<filename>.csv, creating the ./data
 * directory when needed.
 *
 * @param {object[]} data - Records handed to the json2csv parser.
 * @param {string} filename - Base name of the file, without extension.
 * @returns {void}
 */
function exportToCsv(data, filename) {
    const parser = new Parser();
    const csv = parser.parse(data);

    // Without this the first export fails with ENOENT when ./data is missing.
    fs.mkdirSync('./data', { recursive: true });
    fs.writeFileSync(`./data/${filename}.csv`, csv);
}

Database Storage

javascript

// storage/db-storage.js
const { Pool } = require('pg');

const pool = new Pool({
    connectionString: process.env.DATABASE_URL
});

/**
 * Insert one row into `table`, skipping it on conflict.
 *
 * Values are sent as query parameters. Table and column names cannot be
 * parameterised, so they are validated as plain SQL identifiers before being
 * placed in the statement. This matters because column names may come from
 * scraped data.
 *
 * @param {string} table - Table name, optionally schema-qualified (`schema.table`).
 * @param {object} data - Column name to value mapping for the new row.
 * @returns {Promise<void>}
 * @throws {Error} When a table or column name is not a plain identifier, or `data` has no keys.
 */
async function saveToDatabase(table, data) {
    const identifierPattern = /^[A-Za-z_][A-Za-z0-9_]*$/;
    const isIdentifier = (name) => identifierPattern.test(name);

    const tableParts = typeof table === 'string' ? table.split('.') : [];
    const tableIsValid = tableParts.length >= 1
        && tableParts.length <= 2
        && tableParts.every(isIdentifier);
    if (!tableIsValid) {
        throw new Error(`Invalid SQL identifier for table: ${table}`);
    }

    const keys = Object.keys(data);
    if (keys.length === 0) {
        // "INSERT INTO t () VALUES ()" is not valid SQL.
        throw new Error('Cannot insert a row with no columns');
    }
    const invalidKey = keys.find(key => !isIdentifier(key));
    if (invalidKey !== undefined) {
        throw new Error(`Invalid SQL identifier for column: ${invalidKey}`);
    }

    const values = Object.values(data);
    const placeholders = values.map((_, i) => `$${i + 1}`).join(',');

    const query = `
        INSERT INTO ${table} (${keys.join(',')})
        VALUES (${placeholders})
        ON CONFLICT DO NOTHING
    `;

    await pool.query(query, values);
}

Error Handling and Resilience

Retry Logic

javascript

// utils/retry.js
/**
 * Call `fn` until it succeeds, waiting between attempts with exponential backoff.
 *
 * @param {function(): Promise<*>} fn - Operation to attempt.
 * @param {number} [maxRetries=3] - Maximum number of attempts. Values below 1
 *     (or NaN) are treated as 1, so `fn` always runs at least once.
 * @param {number} [delay=1000] - Wait in milliseconds before the second
 *     attempt; doubled after every further failure.
 * @returns {Promise<*>} Whatever `fn` resolves to.
 * @throws The error of the last attempt when every attempt fails.
 */
async function withRetry(fn, maxRetries = 3, delay = 1000) {
    // Without this floor, maxRetries <= 0 skipped the loop and resolved to
    // undefined without ever calling fn.
    const maxAttempts = maxRetries >= 1 ? maxRetries : 1;
    // Track the backoff in a local instead of mutating the parameter.
    let waitMs = delay;

    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            return await fn();
        } catch (error) {
            if (attempt >= maxAttempts) throw error;
            console.log(`Attempt ${attempt} failed, retrying in ${waitMs}ms...`);
            await new Promise(r => setTimeout(r, waitMs));
            waitMs *= 2; // Exponential backoff
        }
    }
}

// Usage
// Up to 5 attempts; the wait starts at the default 1000ms and doubles each time.
const data = await withRetry(() => scrape(url), 5);

Proxy Rotation

javascript

// utils/proxy.js
// Pool of proxy server URLs; scrapeWithProxy picks one at random per call.
// The hosts below are placeholders.
const proxies = [
    'http://proxy1:8080',
    'http://proxy2:8080',
    // ...
];

/**
 * Launch a browser through a randomly chosen proxy from `proxies`.
 *
 * This is a skeleton: the scraping logic itself is left to the caller to fill in.
 *
 * @param {string} url - Page to scrape.
 * @returns {Promise<void>}
 * @throws {Error} When the proxy list is empty.
 */
async function scrapeWithProxy(url) {
    if (proxies.length === 0) {
        // Otherwise the browser would be launched with `server: undefined`.
        throw new Error('No proxies configured');
    }
    const proxy = proxies[Math.floor(Math.random() * proxies.length)];

    const browser = await chromium.launch({
        proxy: { server: proxy }
    });

    try {
        // ... rest of scraping logic
    } finally {
        // The browser is local to this function, so close it here.
        await browser.close();
    }
}

Best Practices

•Respect robots.txt - Check and follow website rules
•Rate limiting - Add delays between requests
•User-Agent - Use a descriptive user agent string
•Error handling - Handle network failures gracefully
•Data validation - Validate extracted data before storage
•Privacy - Don't scrape personal data without consent
•Terms of Service - Comply with website ToS

Ethical Scraping

javascript

// utils/ethical.js
const robotsParser = require('robots-parser');

/**
 * Check the site's robots.txt to see whether `url` may be fetched.
 *
 * @param {string} url - Absolute URL that is about to be scraped.
 * @param {string} [userAgent='MyBot/1.0'] - User agent the rules are evaluated for.
 * @returns {Promise<boolean|undefined>} Result of the robots.txt check. A
 *     robots.txt answered with 4xx counts as "allowed"; one answered with 5xx
 *     counts as "disallowed".
 */
async function isAllowed(url, userAgent = 'MyBot/1.0') {
    const robotsUrl = new URL('/robots.txt', url).toString();
    const response = await fetch(robotsUrl);

    // Do not parse an error page as if it were robots rules.
    if (response.status >= 500) {
        // Server trouble: be conservative and stay away.
        return false;
    }
    if (response.status >= 400) {
        // No robots.txt published: nothing is restricted.
        return true;
    }

    const robotsTxt = await response.text();
    const robots = robotsParser(robotsUrl, robotsTxt);

    return robots.isAllowed(url, userAgent);
}

// Add respectful delays
/**
 * Pause for a random interval between 1 and 3 seconds.
 *
 * @returns {Promise<void>} Resolves once the timer fires.
 */
async function respectfulDelay() {
    // 1000ms base plus up to 2000ms of jitter.
    const pauseMs = 1000 + Math.random() * 2000;
    await new Promise((resolve) => {
        setTimeout(resolve, pauseMs);
    });
}