Data-Cleaner

Name: data-cleaner
Rating: 76
Author: SPIRAL-EDWIN

Systematically clean and preprocess raw data to ensure quality inputs for mathematical models.

When to Use

•After downloading or web scraping raw data
•Before any modeling or analysis
•When data has missing values, outliers, or inconsistent formats
•Need to document data quality for paper

Standard Cleaning Pipeline

Step 1: Load and Inspect

python

import pandas as pd
import numpy as np

def load_data(filepath):
    """Load data with automatic format detection"""
    
    # Detect file type
    if filepath.endswith('.csv'):
        df = pd.read_csv(filepath)
    elif filepath.endswith(('.xls', '.xlsx')):
        df = pd.read_excel(filepath)
    elif filepath.endswith('.json'):
        df = pd.read_json(filepath)
    else:
        raise ValueError(f"Unsupported file format: {filepath}")
    
    # Initial inspection
    print(f"Data shape: {df.shape}")
    print(f"\nColumn types:\n{df.dtypes}")
    print(f"\nFirst few rows:\n{df.head()}")
    print(f"\nMissing values:\n{df.isnull().sum()}")
    print(f"\nBasic statistics:\n{df.describe()}")
    
    return df

Step 2: Handle Missing Values

python

def handle_missing_values(df, strategy='auto', threshold=0.5):
    """
    Handle missing data with appropriate strategy
    
    Parameters:
    - strategy: 'auto', 'drop', 'mean', 'median', 'forward_fill', 'interpolate'
    - threshold: Drop columns if missing > threshold fraction
    """
    
    # Record initial state
    initial_shape = df.shape
    missing_report = df.isnull().sum()
    
    # Drop columns with too many missing values
    missing_fraction = df.isnull().sum() / len(df)
    cols_to_drop = missing_fraction[missing_fraction > threshold].index
    
    if len(cols_to_drop) > 0:
        print(f"Dropping columns with >{threshold*100}% missing: {list(cols_to_drop)}")
        df = df.drop(columns=cols_to_drop)
    
    # Handle remaining missing values
    for col in df.columns:
        if df[col].isnull().any():
            
            if strategy == 'auto':
                # Auto-select strategy based on data type and distribution
                if df[col].dtype in ['int64', 'float64']:
                    # Numeric: use median (robust to outliers)
                    fill_value = df[col].median()
                    df[col].fillna(fill_value, inplace=True)
                    print(f"  {col}: filled {missing_report[col]} values with median ({fill_value:.2f})")
                else:
                    # Categorical: use mode
                    fill_value = df[col].mode()[0]
                    df[col].fillna(fill_value, inplace=True)
                    print(f"  {col}: filled {missing_report[col]} values with mode ('{fill_value}')")
            
            elif strategy == 'mean':
                df[col].fillna(df[col].mean(), inplace=True)
            
            elif strategy == 'median':
                df[col].fillna(df[col].median(), inplace=True)
            
            elif strategy == 'forward_fill':
                df[col].fillna(method='ffill', inplace=True)
            
            elif strategy == 'interpolate':
                df[col].interpolate(method='linear', inplace=True)
            
            elif strategy == 'drop':
                df = df.dropna(subset=[col])
    
    print(f"\nShape after missing value handling: {initial_shape} → {df.shape}")
    
    return df

Step 3: Detect and Handle Outliers

python

def detect_outliers(df, method='iqr', threshold=3):
    """
    Detect outliers using IQR or Z-score method
    
    Parameters:
    - method: 'iqr' (Interquartile Range) or 'zscore'
    - threshold: 3 for z-score, 1.5 for IQR (standard values)
    """
    
    outlier_report = {}
    
    for col in df.select_dtypes(include=[np.number]).columns:
        
        if method == 'iqr':
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            
            outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
            
        elif method == 'zscore':
            z_scores = np.abs((df[col] - df[col].mean()) / df[col].std())
            outliers = df[z_scores > threshold]
        
        if len(outliers) > 0:
            outlier_report[col] = {
                'count': len(outliers),
                'percentage': len(outliers) / len(df) * 100,
                'values': outliers[col].tolist()
            }
            print(f"  {col}: {len(outliers)} outliers ({len(outliers)/len(df)*100:.1f}%)")
    
    return outlier_report

def handle_outliers(df, method='cap', outlier_report=None):
    """
    Handle detected outliers
    
    Parameters:
    - method: 'cap' (winsorize), 'remove', or 'keep'
    """
    
    if method == 'cap':
        # Cap at 1st and 99th percentiles
        for col in df.select_dtypes(include=[np.number]).columns:
            lower = df[col].quantile(0.01)
            upper = df[col].quantile(0.99)
            df[col] = df[col].clip(lower, upper)
        print("Outliers capped at 1st/99th percentiles")
    
    elif method == 'remove':
        # Remove rows with any outliers (use cautiously!)
        initial_len = len(df)
        for col in df.select_dtypes(include=[np.number]).columns:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            df = df[(df[col] >= Q1 - 1.5*IQR) & (df[col] <= Q3 + 1.5*IQR)]
        print(f"Removed {initial_len - len(df)} rows with outliers")
    
    elif method == 'keep':
        print("Outliers kept (no action taken)")
    
    return df

Step 4: Normalize and Standardize

python

def normalize_data(df, method='standard', columns=None):
    """
    Normalize numeric columns
    
    Parameters:
    - method: 'standard' (z-score), 'minmax' (0-1), or 'robust'
    - columns: List of columns to normalize (None = all numeric)
    """
    
    from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
    
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns
    
    if method == 'standard':
        scaler = StandardScaler()
    elif method == 'minmax':
        scaler = MinMaxScaler()
    elif method == 'robust':
        scaler = RobustScaler()  # Robust to outliers
    
    df[columns] = scaler.fit_transform(df[columns])
    
    print(f"Normalized {len(columns)} columns using {method} scaling")
    
    return df, scaler

Step 5: Data Type Conversion

python

def fix_data_types(df):
    """Convert columns to appropriate data types"""
    
    for col in df.columns:
        
        # Try to convert string dates to datetime
        if df[col].dtype == 'object':
            try:
                df[col] = pd.to_datetime(df[col])
                print(f"  {col}: converted to datetime")
            except:
                pass
        
        # Convert object columns with few unique values to category
        if df[col].dtype == 'object':
            n_unique = df[col].nunique()
            if n_unique < 50:  # Heuristic threshold
                df[col] = df[col].astype('category')
                print(f"  {col}: converted to category ({n_unique} levels)")
    
    return df

Complete Cleaning Function

python

def clean_data(filepath, output_path='data/processed.csv', 
               missing_strategy='auto', outlier_method='cap',
               normalize=False):
    """
    Complete data cleaning pipeline
    
    Returns: cleaned DataFrame and cleaning report
    """
    
    print("="*60)
    print("DATA CLEANING PIPELINE")
    print("="*60)
    
    # Load
    print("\n[1/6] Loading data...")
    df = load_data(filepath)
    
    # Missing values
    print("\n[2/6] Handling missing values...")
    df = handle_missing_values(df, strategy=missing_strategy)
    
    # Data types
    print("\n[3/6] Fixing data types...")
    df = fix_data_types(df)
    
    # Outliers
    print("\n[4/6] Detecting outliers...")
    outlier_report = detect_outliers(df, method='iqr')
    
    print("\n[5/6] Handling outliers...")
    df = handle_outliers(df, method=outlier_method)
    
    # Normalize (optional)
    if normalize:
        print("\n[6/6] Normalizing data...")
        df, scaler = normalize_data(df, method='standard')
    else:
        print("\n[6/6] Skipping normalization")
        scaler = None
    
    # Save
    df.to_csv(output_path, index=False)
    print(f"\nCleaned data saved to: {output_path}")
    
    # Generate report
    report = {
        'original_shape': df.shape,
        'final_shape': df.shape,
        'columns': list(df.columns),
        'dtypes': df.dtypes.to_dict(),
        'missing_values': df.isnull().sum().to_dict(),
        'outliers': outlier_report,
        'summary_stats': df.describe().to_dict()
    }
    
    # Save report
    import json
    with open(output_path.replace('.csv', '_report.json'), 'w') as f:
        json.dump(report, f, indent=2, default=str)
    
    print("\nCleaning complete!")
    print("="*60)
    
    return df, report

Usage Example

python

# Clean data with default settings
df_clean, report = clean_data(
    filepath='data/raw_data.csv',
    output_path='data/processed.csv',
    missing_strategy='auto',
    outlier_method='cap',
    normalize=False
)

# Access cleaned data
print(df_clean.head())
print(df_clean.info())

Output Files

Save to data/:

•processed.csv - Cleaned dataset
•processed_report.json - Cleaning statistics and decisions
•raw_data.csv - Original data (never modify!)

Quality Checklist

Before using cleaned data:

• No missing values remain (or documented why kept)
• Outliers handled appropriately (not just deleted blindly)
• Data types are correct (dates as datetime, categories as category)
• Column names are clean (no spaces, lowercase)
• Duplicates checked and removed if appropriate
• Ranges are reasonable (no negative ages, etc.)

Common Pitfalls

•Deleting too much: Removing outliers can lose real information
•Wrong imputation: Using mean for skewed data (use median)
•No documentation: Must record what cleaning was done
•Normalizing too early: Clean first, normalize last
•Ignoring domain knowledge: Some "outliers" are real (billionaires exist!)