#!/bin/bash

# BigGrep Benchmark Comparison Script
# Compares performance between original BigGrep and BigGrep Rust implementations

# Abort the script as soon as an unhandled command fails.
set -o errexit

# ANSI colour sequences for the print_* helpers. They are stored as literal
# backslash escapes and only turned into control characters when printed.
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'

# Defaults; every value below can be overridden from the command line.
# -- input / output locations
DATASET_DIR=
PATTERN_FILE=
INDEX_DIR=./benchmark_indexes
RESULTS_DIR=./benchmark_results
# -- where the two implementations live
ORIGINAL_BGREP_PATH=
RUST_BGREP_PATH=.
# -- benchmark sizing
SAMPLE_SIZE=1000
NUM_RUNS=3
WARMUP_RUNS=1
# -- remove generated files on exit unless --no-cleanup is given
CLEANUP=true

# Print the command-line help text to standard output.
# "$0" is expanded so the usage line and examples show how the script was invoked.
usage() {
    cat << EOF
BigGrep Benchmark Comparison Script

Usage: $0 [OPTIONS]

Options:
  -d, --dataset DIR       Directory containing files to index and search
  -i, --index-dir DIR     Directory to store index files (default: ./benchmark_indexes)
  -r, --results-dir DIR   Directory to store results (default: ./benchmark_results)
  -s, --sample-size N     Number of files to sample from dataset (default: 1000)
  -p, --patterns FILE     File containing search patterns (one per line)
  -n, --num-runs N        Number of benchmark runs per test (default: 3)
  -o, --original PATH     Path to original BigGrep binaries
  -R, --rust PATH         Path to Rust BigGrep binaries (default: ./)
  -w, --warmup N          Number of warmup runs (default: 1)
  --no-cleanup            Don't clean up generated files
  -h, --help              Show this help message

Examples:
  $0 -d /data/benchmark -p patterns.txt
  $0 -d /data/ -i /tmp/indexes -r /tmp/results -n 5

EOF
}

# Print an informational message ($1) behind a blue "[INFO]" tag.
# Backslash escapes in the tag and in the message are interpreted (echo -e).
print_status() {
    local tag="${BLUE}[INFO]${NC}"
    echo -e "$tag $1"
}

# Print a success message ($1) behind a green "[SUCCESS]" tag.
# Backslash escapes in the tag and in the message are interpreted (echo -e).
print_success() {
    local tag="${GREEN}[SUCCESS]${NC}"
    echo -e "$tag $1"
}

# Print a warning message ($1) behind a yellow "[WARNING]" tag.
# Written to stderr so that a warning raised inside a function whose stdout is
# captured with $(...) cannot end up in the captured value.
print_warning() {
    echo -e "${YELLOW}[WARNING]${NC} $1" >&2
}

# Print an error message ($1) behind a red "[ERROR]" tag.
# Written to stderr so errors stay visible when stdout is redirected or
# captured, and never mix into data the script emits on stdout.
print_error() {
    echo -e "${RED}[ERROR]${NC} $1" >&2
}

# Copy a sample of files from a source tree into a flat sample directory.
#
# Candidates are ordered by modification time; a random number of the oldest
# ones (at most 99) is skipped and the next $3 files are taken. The skip is
# capped so that it never eats into the requested sample: a dataset with fewer
# than sample_size + 99 files still yields as many files as it can.
#
# Arguments:
#   $1 - directory to sample from (searched recursively)
#   $2 - destination directory (created if missing)
#   $3 - maximum number of files to sample
# Outputs:
#   "$2/file_list.txt" receives the selected source paths, one per line, and
#   each selected file is copied into "$2". Files whose base name is already
#   taken in "$2" get a numeric suffix instead of overwriting the earlier copy.
# Note: relies on GNU find (-printf).
create_sample_dataset() {
    local source_dir=$1
    local sample_dir=$2
    local sample_size=$3
    local list_file="${sample_dir}/file_list.txt"
    local total max_skip skip file dest suffix
    local copied=0

    print_status "Creating sample dataset of $sample_size files..."

    mkdir -p "$sample_dir"

    # Work out how many leading entries may be skipped without shrinking the
    # sample below what the dataset can provide.
    total=$(find "$source_dir" -type f | wc -l)
    max_skip=$((total - sample_size))
    if [ "$max_skip" -lt 0 ]; then
        max_skip=0
    elif [ "$max_skip" -gt 99 ]; then
        max_skip=99
    fi
    skip=$((RANDOM % (max_skip + 1)))

    # Create sample file list
    find "$source_dir" -type f -printf "%T@ %p\n" | \
        sort -n | \
        tail -n "+$((skip + 1))" | \
        head -n "$sample_size" | \
        cut -d' ' -f2- > "$list_file"

    # Copy sample files
    while IFS= read -r file; do
        if [ ! -f "$file" ]; then
            continue
        fi

        dest="${sample_dir}/$(basename "$file")"
        if [ -e "$dest" ]; then
            # Same base name seen before (or it is the list file itself):
            # pick the first free "<name>.<n>" instead of overwriting.
            suffix=1
            while [ -e "${dest}.${suffix}" ]; do
                suffix=$((suffix + 1))
            done
            dest="${dest}.${suffix}"
        fi

        # Copying stays best-effort: an unreadable file is skipped, but it is
        # not counted as part of the sample.
        if cp -- "$file" "$dest" 2>/dev/null; then
            copied=$((copied + 1))
        else
            print_warning "Could not copy $file - skipping"
        fi
    done < "$list_file"

    print_success "Sample dataset created with $copied files"
}

# Write the built-in list of search patterns to a file, one pattern per line.
#
# Arguments:
#   $1 - path of the pattern file to create (overwritten if it exists)
create_default_patterns() {
    local patterns_file=$1
    local -a builtin_patterns=(
        password api_key secret config debug
        error warning test file data
        log cache temp backup admin
        root user auth token session
    )

    print_status "Creating default search patterns..."

    printf '%s\n' "${builtin_patterns[@]}" > "$patterns_file"

    print_success "Default patterns created: $patterns_file"
}

# Time a shell command and print its average wall-clock duration.
#
# Globals:
#   WARMUP_RUNS (read) - untimed runs executed first; their failures are ignored
#   NUM_RUNS    (read) - timed runs
# Arguments:
#   $1 - command line as ONE string; it is executed with `eval`, so the caller
#        is responsible for quoting any paths or patterns embedded in it
#   $2 - log prefix; run i's combined stdout/stderr is written to "$2.i"
# Outputs:
#   stdout - the average duration in seconds over the successful runs
#            (computed with bc, scale=3), or the word FAILED if no run succeeded
#   stderr - a warning for each failed run
# The result is the ONLY thing written to stdout: callers capture it with
# $(...) and compare it against "FAILED", so diagnostics must not leak into it.
time_command() {
    local cmd="$1"
    local output_file="$2"
    local i start_time end_time duration average_time
    local total_time=0
    local runs=0

    # Warmup run
    for ((i = 1; i <= WARMUP_RUNS; i++)); do
        eval "$cmd" > /dev/null 2>&1 || true
    done

    # Actual timing runs
    for ((i = 1; i <= NUM_RUNS; i++)); do
        start_time=$(date +%s.%N)

        if eval "$cmd" > "${output_file}.${i}" 2>&1; then
            end_time=$(date +%s.%N)
            duration=$(echo "$end_time - $start_time" | bc -l)
            total_time=$(echo "$total_time + $duration" | bc -l)
            runs=$((runs + 1))
        else
            # Redirected explicitly so the warning stays out of the captured
            # result no matter which stream print_warning itself uses.
            print_warning "Run $i failed" >&2
        fi
    done

    if [ "$runs" -gt 0 ]; then
        average_time=$(echo "scale=3; $total_time / $runs" | bc -l)
        echo "$average_time"
    else
        echo "FAILED"
    fi
}

# Succeed if directory $1 holds executable bgindex.py AND bgsearch.py.
_has_original_biggrep() {
    [ -x "$1/bgindex.py" ] && [ -x "$1/bgsearch.py" ]
}

# Locate the original BigGrep scripts.
#
# Globals:
#   ORIGINAL_BGREP_PATH (read/written) - when non-empty only that directory is
#     checked; when empty the current directory and /usr/local/bin are probed
#     in that order and the variable is set to the first one that qualifies.
# Returns:
#   0 if both bgindex.py and bgsearch.py are executable in the chosen
#   directory, 1 otherwise. A status message is printed either way.
check_original_biggrep() {
    local candidate

    if [ -n "$ORIGINAL_BGREP_PATH" ]; then
        if _has_original_biggrep "$ORIGINAL_BGREP_PATH"; then
            print_success "Original BigGrep found at $ORIGINAL_BGREP_PATH"
            return 0
        fi
        print_error "Original BigGrep not found or not executable at $ORIGINAL_BGREP_PATH"
        return 1
    fi

    # Check common locations. Both scripts are required everywhere: the
    # benchmarks run bgindex.py and bgsearch.py from the same directory.
    for candidate in . /usr/local/bin; do
        if _has_original_biggrep "$candidate"; then
            ORIGINAL_BGREP_PATH="$candidate"
            if [ "$candidate" = "." ]; then
                print_success "Original BigGrep found in current directory"
            else
                print_success "Original BigGrep found at $ORIGINAL_BGREP_PATH"
            fi
            return 0
        fi
    done

    print_warning "Original BigGrep not found - will skip comparison"
    return 1
}

# Verify that the Rust BigGrep binaries are present.
#
# Globals:
#   RUST_BGREP_PATH (read) - directory expected to contain rs-bgindex and
#                            rs-bgsearch
# Returns:
#   0 when both binaries are executable, 1 otherwise (after printing a hint to
#   build the project).
check_rust_biggrep() {
    local bin_dir=$RUST_BGREP_PATH

    if [ ! -x "$bin_dir/rs-bgindex" ] || [ ! -x "$bin_dir/rs-bgsearch" ]; then
        print_error "Rust BigGrep not found or not executable at $RUST_BGREP_PATH"
        print_error "Please build the project first: cargo build --release"
        return 1
    fi

    print_success "Rust BigGrep found at $RUST_BGREP_PATH"
    return 0
}

# Run an index-build command repeatedly and report timing statistics.
#
# Globals:   WARMUP_RUNS, NUM_RUNS (read)
# Arguments:
#   $1 - command line as one string (executed with eval; must already be quoted)
#   $2 - log prefix; run i's combined stdout/stderr is written to "$2.i"
# Outputs:
#   stdout - "AVG MIN MAX" in seconds (three decimals) over the successful
#            timed runs, or the word FAILED when none succeeded
#   stderr - a warning for each failed run
_index_time_stats() {
    local cmd=$1
    local log_prefix=$2
    local i start_time end_time
    local -a durations=()

    for ((i = 1; i <= WARMUP_RUNS; i++)); do
        eval "$cmd" > /dev/null 2>&1 || true
    done

    for ((i = 1; i <= NUM_RUNS; i++)); do
        start_time=$(date +%s.%N)
        if eval "$cmd" > "${log_prefix}.${i}" 2>&1; then
            end_time=$(date +%s.%N)
            durations+=("$(LC_ALL=C awk -v s="$start_time" -v e="$end_time" \
                'BEGIN { printf "%.6f", e - s }')")
        else
            print_warning "Run $i failed" >&2
        fi
    done

    if [ "${#durations[@]}" -eq 0 ]; then
        echo "FAILED"
        return 0
    fi

    printf '%s\n' "${durations[@]}" | LC_ALL=C awk '
        NR == 1 { min = $1; max = $1 }
        {
            sum += $1
            if ($1 < min) min = $1
            if ($1 > max) max = $1
        }
        END { printf "%.3f %.3f %.3f\n", sum / NR, min, max }'
}

# Benchmark one indexer and append its row to the results CSV.
#
# Arguments:
#   $1 - implementation label used in the CSV and in messages
#   $2 - indexer executable; invoked as: <indexer> -p <index prefix> < <file list>
#   $3 - log prefix for the per-run output files
#   $4 - index file the indexer is expected to produce (ends in .bgi)
#   $5 - file containing the paths to index, one per line
#   $6 - results CSV to append to
_benchmark_index_build() {
    local label=$1
    local indexer=$2
    local output_file=$3
    local index_file=$4
    local file_list=$5
    local results_file=$6
    local cmd stats avg_time min_time max_time
    local size="N/A"
    local size_text="N/A"

    mkdir -p "$INDEX_DIR"
    # Also drop logs of earlier invocations so nothing stale is left behind.
    rm -f "$index_file" "${output_file}".*

    # %q keeps paths containing spaces or shell metacharacters intact when the
    # string is later passed through eval.
    printf -v cmd '%q -p %q < %q' "$indexer" "${index_file%.bgi}" "$file_list"
    stats=$(_index_time_stats "$cmd" "$output_file")

    if [ "$stats" = "FAILED" ]; then
        print_error "$label index building failed"
        echo "$label,FAILED,FAILED,FAILED,N/A" >> "$results_file"
        return 0
    fi

    read -r avg_time min_time max_time <<< "$stats"

    # The indexer may not have written the file where it is expected; report
    # the size as N/A rather than failing on du.
    if [ -e "$index_file" ]; then
        size=$(du -sm "$index_file" | cut -f1)
        size_text="${size}MB"
    fi

    echo "$label,$avg_time,$min_time,$max_time,$size" >> "$results_file"
    print_success "$label: ${avg_time}s (size: ${size_text})"
}

# Benchmark index building for every available implementation.
#
# Globals:
#   RESULTS_DIR, INDEX_DIR, RUST_BGREP_PATH (read)
#   ORIGINAL_BGREP_PATH (read; may be set by check_original_biggrep)
# Arguments:
#   $1 - sample directory; "$1/file_list.txt" is regenerated from its contents
#   $2 - results CSV (overwritten) with the columns
#        Implementation, Average/Min/Max Time (s), Index Size (MB)
run_index_benchmark() {
    local sample_dir=$1
    local results_file=$2
    local file_list="${sample_dir}/file_list.txt"

    print_status "Running index building benchmarks..."

    # Prepare file list. The redirection creates the list before find starts,
    # so it has to be excluded explicitly or it would be indexed as well.
    find "$sample_dir" -type f ! -name 'file_list.txt' > "$file_list"

    echo "Implementation,Average Time (s),Min Time (s),Max Time (s),Index Size (MB)" > "$results_file"

    # Benchmark original BigGrep
    if check_original_biggrep; then
        print_status "Benchmarking original BigGrep index building..."
        _benchmark_index_build "Original BigGrep" \
            "$ORIGINAL_BGREP_PATH/bgindex.py" \
            "${RESULTS_DIR}/original_index.log" \
            "${INDEX_DIR}/original_index.bgi" \
            "$file_list" "$results_file"
    fi

    # Benchmark Rust BigGrep
    if check_rust_biggrep; then
        print_status "Benchmarking Rust BigGrep index building..."
        _benchmark_index_build "Rust BigGrep" \
            "$RUST_BGREP_PATH/rs-bgindex" \
            "${RESULTS_DIR}/rust_index.log" \
            "${INDEX_DIR}/rust_index.bgi" \
            "$file_list" "$results_file"
    fi
}

# Count the result lines in one search log.
#
# Lines that start with the literal tags "[INFO]" or "[WARNING]" are ignored.
# When $2 is non-empty, lines containing that fixed string are ignored too.
#
# Arguments:
#   $1 - log file of a single search run
#   $2 - optional fixed string marking additional lines to ignore
# Outputs: the count on stdout (0 if the log does not exist)
_count_search_results() {
    local log_file=$1
    local extra_filter=${2:-}

    if [ ! -f "$log_file" ]; then
        echo 0
        return 0
    fi

    SEARCH_EXTRA_FILTER="$extra_filter" awk '
        BEGIN { skip = ENVIRON["SEARCH_EXTRA_FILTER"] }
        /^\[INFO\]/ || /^\[WARNING\]/ { next }
        skip != "" && index($0, skip) { next }
        { count++ }
        END { print count + 0 }' "$log_file"
}

# Time every pattern against one implementation and append the CSV rows.
#
# Arguments:
#   $1 - implementation label used in the CSV and in messages
#   $2 - search executable
#   $3 - option that introduces the pattern (-s or -a)
#   $4 - stem for the per-pattern log names under RESULTS_DIR
#   $5 - extra fixed string to exclude when counting results ("" for none)
#   $6 - results CSV to append to
#   $7... - patterns to search for
_benchmark_search_impl() {
    local label=$1
    local searcher=$2
    local pattern_flag=$3
    local log_stem=$4
    local extra_filter=$5
    local results_file=$6
    shift 6
    local pattern output_file cmd avg_time result_count

    for pattern in "$@"; do
        print_status "$label search: '$pattern'"

        output_file="${RESULTS_DIR}/${log_stem}_${pattern//[^a-zA-Z0-9]/_}.log"
        # Logs from an earlier invocation must not be mistaken for this one's.
        rm -f "${output_file}".*

        # %q quotes the pattern and paths for the eval inside time_command, so
        # quotes or spaces in a pattern cannot break or alter the command.
        printf -v cmd '%q -d %q %s %q' "$searcher" "$INDEX_DIR" "$pattern_flag" "$pattern"
        avg_time=$(time_command "$cmd" "$output_file")
        # Keep only the final line in case anything else was printed before it.
        avg_time=${avg_time##*$'\n'}

        if [ "$avg_time" != "FAILED" ]; then
            # Every timed run repeats the same query, so the hits are counted
            # in the first run's log only instead of being summed over all runs.
            result_count=$(_count_search_results "${output_file}.1" "$extra_filter")
            echo "'$pattern',$label,$avg_time,$result_count" >> "$results_file"
            print_success "  Results: $result_count in ${avg_time}s"
        else
            echo "'$pattern',$label,FAILED,0" >> "$results_file"
        fi
    done
}

# Benchmark searches for a random selection of patterns.
#
# NUM_RUNS * 2 patterns are drawn at random (with repetition) from the pattern
# file and de-duplicated; each remaining pattern is timed with time_command
# against every implementation whose index file exists in INDEX_DIR.
#
# Globals:
#   NUM_RUNS, INDEX_DIR, RESULTS_DIR, RUST_BGREP_PATH (read)
#   ORIGINAL_BGREP_PATH (read; may be set by check_original_biggrep)
# Arguments:
#   $1 - pattern file, one pattern per line (blank lines are ignored)
#   $2 - results CSV (overwritten) with the columns
#        Pattern, Implementation, Average Time (s), Results Count
run_search_benchmark() {
    local patterns_file=$1
    local results_file=$2
    local -a patterns=() sample_patterns=() unique_patterns=()
    local pattern i idx

    print_status "Running search benchmarks..."

    echo "Pattern,Implementation,Average Time (s),Results Count" > "$results_file"

    # Read patterns (also accept a last line without a trailing newline)
    while IFS= read -r pattern || [ -n "$pattern" ]; do
        if [ -n "$pattern" ]; then
            patterns+=("$pattern")
        fi
    done < "$patterns_file"

    if [ "${#patterns[@]}" -eq 0 ]; then
        # Without this guard the sampling below would divide by zero.
        print_warning "No search patterns found in $patterns_file - skipping search benchmark"
        return 0
    fi

    # Sample a subset of patterns for benchmarking
    for ((i = 1; i <= NUM_RUNS * 2; i++)); do
        idx=$((RANDOM % ${#patterns[@]}))
        sample_patterns+=("${patterns[$idx]}")
    done

    if [ "${#sample_patterns[@]}" -eq 0 ]; then
        print_warning "No patterns sampled - skipping search benchmark"
        return 0
    fi

    # Remove duplicates
    mapfile -t unique_patterns < <(printf '%s\n' "${sample_patterns[@]}" | sort -u)

    print_status "Benchmarking ${#unique_patterns[@]} unique patterns..."

    # Benchmark original BigGrep
    if check_original_biggrep; then
        if [ -f "${INDEX_DIR}/original_index.bgi" ]; then
            _benchmark_search_impl "Original BigGrep" \
                "$ORIGINAL_BGREP_PATH/bgsearch.py" -s original_search "" \
                "$results_file" "${unique_patterns[@]}"
        else
            print_warning "Original BigGrep index not found - skipping original search benchmark"
        fi
    fi

    # Benchmark Rust BigGrep
    if check_rust_biggrep; then
        if [ -f "${INDEX_DIR}/rust_index.bgi" ]; then
            # Lines containing "file_path" are excluded from the Rust counts.
            _benchmark_search_impl "Rust BigGrep" \
                "$RUST_BGREP_PATH/rs-bgsearch" -a rust_search "file_path" \
                "$results_file" "${unique_patterns[@]}"
        else
            print_warning "Rust BigGrep index not found - skipping Rust search benchmark"
        fi
    fi
}

# Run a command under /usr/bin/time and print its peak resident set size.
#
# The command's own stdout/stderr are discarded and the time report is written
# to a temporary file (-o), so tool output cannot be confused with the report.
# stdin is passed through to the command.
#
# Arguments: $@ - the command and its arguments
# Outputs:   peak RSS in whole MB on stdout, or "N/A" when /usr/bin/time is
#            missing or its report has no "Maximum resident set size" line
# Note: needs a time implementation that supports -v and -o (e.g. GNU time).
_peak_rss_mb() {
    local report rss_kb

    if [ ! -x /usr/bin/time ]; then
        echo "N/A"
        return 0
    fi

    report=$(mktemp) || { echo "N/A"; return 0; }
    # The measured command may fail; the report is still what matters here.
    /usr/bin/time -v -o "$report" "$@" > /dev/null 2>&1 || true
    rss_kb=$(awk -F': *' '/Maximum resident set size/ { print $2; exit }' "$report")
    rm -f "$report"

    if [[ "$rss_kb" =~ ^[0-9]+$ ]]; then
        echo $((rss_kb / 1024))
    else
        echo "N/A"
    fi
}

# Measure one implementation and append its row to the results CSV.
#
# Arguments:
#   $1 - implementation label
#   $2 - indexer executable (run as: <indexer> -p <index prefix> < <file list>)
#   $3 - search executable  (run as: <searcher> -d INDEX_DIR <flag> "test")
#   $4 - option that introduces the search pattern (-s or -a)
#   $5 - index prefix handed to the indexer
#   $6 - file list fed to the indexer on stdin
#   $7 - results CSV to append to
_measure_memory() {
    local label=$1
    local indexer=$2
    local searcher=$3
    local pattern_flag=$4
    local index_prefix=$5
    local file_list=$6
    local results_file=$7
    local peak_memory="N/A"
    local avg_memory

    if [ -r "$file_list" ]; then
        peak_memory=$(_peak_rss_mb "$indexer" -p "$index_prefix" < "$file_list")
    else
        print_warning "File list not found: $file_list - skipping index memory measurement"
    fi
    avg_memory=$(_peak_rss_mb "$searcher" -d "$INDEX_DIR" "$pattern_flag" "test")

    echo "$label,$peak_memory,$avg_memory" >> "$results_file"
    print_success "$label: Peak: ${peak_memory}MB, Avg: ${avg_memory}MB"
}

# Measure memory use of index building and searching for every available
# implementation.
#
# The first CSV column after the label is the indexer's peak RSS. The second,
# labelled "Average Memory (MB)", is the peak RSS of a single search for the
# pattern "test"; the label is kept so the CSV layout stays unchanged.
#
# Globals:
#   INDEX_DIR, RUST_BGREP_PATH (read)
#   ORIGINAL_BGREP_PATH (read; may be set by check_original_biggrep)
# Arguments:
#   $1 - sample directory containing file_list.txt
#   $2 - results CSV (overwritten)
run_memory_benchmark() {
    local sample_dir=$1
    local results_file=$2
    local file_list="${sample_dir}/file_list.txt"

    print_status "Running memory usage benchmark..."

    echo "Implementation,Peak Memory (MB),Average Memory (MB)" > "$results_file"

    # Benchmark original BigGrep
    if check_original_biggrep; then
        print_status "Measuring original BigGrep memory usage..."
        _measure_memory "Original BigGrep" \
            "$ORIGINAL_BGREP_PATH/bgindex.py" "$ORIGINAL_BGREP_PATH/bgsearch.py" -s \
            "${INDEX_DIR}/mem_original" "$file_list" "$results_file"
    fi

    # Benchmark Rust BigGrep
    if check_rust_biggrep; then
        print_status "Measuring Rust BigGrep memory usage..."
        _measure_memory "Rust BigGrep" \
            "$RUST_BGREP_PATH/rs-bgindex" "$RUST_BGREP_PATH/rs-bgsearch" -a \
            "${INDEX_DIR}/mem_rust" "$file_list" "$results_file"
    fi
}

# Render CSV read from stdin as an aligned table.
# Falls back to passing the CSV through unchanged when `column` is unavailable.
_summary_table() {
    if command -v column > /dev/null 2>&1; then
        column -t -s ','
    else
        cat
    fi
}

# Write the Markdown summary report.
#
# Globals:
#   DATASET_DIR, SAMPLE_SIZE, NUM_RUNS, WARMUP_RUNS (read) - echoed into the
#   "Test Configuration" section
# Arguments:
#   $1 - index benchmark CSV
#   $2 - search benchmark CSV (first column is the pattern, written by the
#        search benchmark wrapped in single quotes)
#   $3 - memory benchmark CSV
#   $4 - summary file to write (overwritten)
# A CSV that does not exist is replaced by a "No ... available" line.
generate_summary() {
    local index_results=$1
    local search_results=$2
    local memory_results=$3
    local summary_file=$4
    local pattern

    print_status "Generating results summary..."

    {
        printf '%s\n' '# BigGrep Performance Benchmark Summary' '' '## Test Configuration' ''

        echo "- Dataset: $DATASET_DIR"
        echo "- Sample Size: $SAMPLE_SIZE files"
        echo "- Number of Runs: $NUM_RUNS"
        echo "- Warmup Runs: $WARMUP_RUNS"

        printf '%s\n' '' '## Index Building Performance' ''

        if [ -f "$index_results" ]; then
            _summary_table < "$index_results"
        else
            echo "No index building results available"
        fi

        printf '%s\n' '' '## Search Performance' ''

        if [ -f "$search_results" ]; then
            # Group by pattern. The first CSV field is used verbatim, quotes
            # included, both for the heading and for selecting the rows, and it
            # is compared as an exact string so one pattern cannot pull in the
            # rows of another. Reading line by line keeps patterns that
            # contain spaces in one piece.
            while IFS= read -r pattern; do
                echo ""
                echo "### Pattern: $pattern"
                SUMMARY_PATTERN="$pattern" awk -F',' \
                    'NR > 1 && $1 == ENVIRON["SUMMARY_PATTERN"]' "$search_results" \
                    | _summary_table
            done < <(tail -n +2 "$search_results" | cut -d',' -f1 | sort -u)
        else
            echo "No search results available"
        fi

        printf '%s\n' '' '## Memory Usage' ''

        if [ -f "$memory_results" ]; then
            _summary_table < "$memory_results"
        else
            echo "No memory usage results available"
        fi

        printf '%s\n' '' '## Analysis' '' '### Performance Comparison' ''

        # Add performance analysis
        if [ -f "$index_results" ]; then
            echo "Index building performance analysis:"
            # Add analysis logic here

            if [ -f "$search_results" ]; then
                echo "Search performance analysis:"
                # Add analysis logic here
            fi
        fi
    } > "$summary_file"

    print_success "Summary report generated: $summary_file"
}

# Remove generated intermediate files; installed as the EXIT trap.
#
# Does nothing unless CLEANUP is "true". Otherwise deletes the index directory,
# the per-run log files in the results directory ("*.log" and "*.log.<n>") and
# the sample file list. CSV results and the summary report are kept.
#
# Globals: CLEANUP, INDEX_DIR, RESULTS_DIR (read)
cleanup() {
    if [ "$CLEANUP" != true ]; then
        return 0
    fi

    print_status "Cleaning up temporary files..."

    if [ -n "$INDEX_DIR" ]; then
        rm -rf -- "$INDEX_DIR"
    fi

    if [ -n "$RESULTS_DIR" ]; then
        # The globs must stay outside the quotes, otherwise they are taken
        # literally and nothing is removed. The file list path is derived from
        # RESULTS_DIR (main keeps the sample in "$RESULTS_DIR/sample_dataset")
        # rather than from a variable that is local to main.
        rm -f -- "$RESULTS_DIR"/*.log "$RESULTS_DIR"/*.log.* \
            "${RESULTS_DIR}/sample_dataset/file_list.txt"
    fi
}

# Run the whole benchmark: sample the dataset, run the index, search and
# memory benchmarks, then write and preview the summary report.
#
# Globals:
#   INDEX_DIR, RESULTS_DIR, DATASET_DIR, SAMPLE_SIZE (read)
#   PATTERN_FILE (read) - pattern file supplied by the user; when empty, the
#                         built-in patterns are written to
#                         "$RESULTS_DIR/patterns.txt" and used instead
# Returns:
#   1 if a pattern file was supplied but cannot be read; otherwise the status
#   of the last step.
main() {
    print_status "Starting BigGrep benchmark comparison..."

    # Create output directories
    mkdir -p "$INDEX_DIR" "$RESULTS_DIR"

    # Resolve the pattern file first so a bad --patterns argument is reported
    # before any expensive work starts.
    local patterns_file
    if [ -z "$PATTERN_FILE" ]; then
        patterns_file="$RESULTS_DIR/patterns.txt"
        create_default_patterns "$patterns_file"
    elif [ -r "$PATTERN_FILE" ]; then
        patterns_file="$PATTERN_FILE"
    else
        print_error "Pattern file not found or not readable: $PATTERN_FILE"
        return 1
    fi

    # Create sample dataset
    local sample_dir="${RESULTS_DIR}/sample_dataset"
    create_sample_dataset "$DATASET_DIR" "$sample_dir" "$SAMPLE_SIZE"

    # Run benchmarks
    local index_results="${RESULTS_DIR}/index_benchmark.csv"
    local search_results="${RESULTS_DIR}/search_benchmark.csv"
    local memory_results="${RESULTS_DIR}/memory_benchmark.csv"
    local summary_file="${RESULTS_DIR}/benchmark_summary.md"

    run_index_benchmark "$sample_dir" "$index_results"
    run_search_benchmark "$patterns_file" "$search_results"
    run_memory_benchmark "$sample_dir" "$memory_results"

    # Generate summary
    generate_summary "$index_results" "$search_results" "$memory_results" "$summary_file"

    # Display results
    print_success "Benchmark completed!"
    print_status "Results saved to: $RESULTS_DIR"
    print_status "Summary report: $summary_file"

    # Display quick summary: of the first 20 lines of the report, show those
    # starting with "-", an upper-case letter or a digit. grep finding nothing
    # is not an error.
    if [ -f "$summary_file" ]; then
        echo ""
        print_status "Quick Summary:"
        head -20 "$summary_file" | grep -E "^-|^[A-Z]|^[0-9]" || true
    fi
}

# Parse command-line arguments into the global configuration variables.
#
# Arguments: $@ - the script's command-line arguments
# Globals written: DATASET_DIR, INDEX_DIR, RESULTS_DIR, SAMPLE_SIZE,
#   PATTERN_FILE, NUM_RUNS, ORIGINAL_BGREP_PATH, RUST_BGREP_PATH, WARMUP_RUNS,
#   CLEANUP
# Exits with status 0 after printing the help for -h/--help, and with status 1
# (after an error message and the help) for an unknown option or for an option
# whose value is missing.
parse_args() {
    while [[ $# -gt 0 ]]; do
        # Options that take a value: make sure there is one. Otherwise
        # "shift 2" would fail and, under "set -e", end the script without
        # any explanation.
        case "$1" in
            -d|--dataset|-i|--index-dir|-r|--results-dir|-s|--sample-size|\
            -p|--patterns|-n|--num-runs|-o|--original|-R|--rust|-w|--warmup)
                if [[ $# -lt 2 ]]; then
                    print_error "Option $1 requires an argument"
                    usage
                    exit 1
                fi
                ;;
        esac

        case "$1" in
            -d|--dataset)
                DATASET_DIR="$2"
                shift 2
                ;;
            -i|--index-dir)
                INDEX_DIR="$2"
                shift 2
                ;;
            -r|--results-dir)
                RESULTS_DIR="$2"
                shift 2
                ;;
            -s|--sample-size)
                SAMPLE_SIZE="$2"
                shift 2
                ;;
            -p|--patterns)
                PATTERN_FILE="$2"
                shift 2
                ;;
            -n|--num-runs)
                NUM_RUNS="$2"
                shift 2
                ;;
            -o|--original)
                ORIGINAL_BGREP_PATH="$2"
                shift 2
                ;;
            -R|--rust)
                RUST_BGREP_PATH="$2"
                shift 2
                ;;
            -w|--warmup)
                WARMUP_RUNS="$2"
                shift 2
                ;;
            --no-cleanup)
                CLEANUP=false
                shift
                ;;
            -h|--help)
                usage
                exit 0
                ;;
            *)
                print_error "Unknown option: $1"
                usage
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# Validate required arguments
if [ -z "$DATASET_DIR" ]; then
    print_error "Dataset directory is required"
    usage
    exit 1
fi

if [ ! -d "$DATASET_DIR" ]; then
    print_error "Dataset directory not found: $DATASET_DIR"
    exit 1
fi

# The numeric options end up in shell arithmetic, loop bounds and "head -n",
# so anything that is not a plain decimal integer is rejected up front.
# Leading zeros are refused because arithmetic would read them as octal.
if ! [[ "$SAMPLE_SIZE" =~ ^[1-9][0-9]*$ ]]; then
    print_error "Sample size must be a positive integer: $SAMPLE_SIZE"
    exit 1
fi

if ! [[ "$NUM_RUNS" =~ ^[1-9][0-9]*$ ]]; then
    print_error "Number of runs must be a positive integer: $NUM_RUNS"
    exit 1
fi

if ! [[ "$WARMUP_RUNS" =~ ^(0|[1-9][0-9]*)$ ]]; then
    print_error "Number of warmup runs must be a non-negative integer: $WARMUP_RUNS"
    exit 1
fi

# Check for required tools
if ! command -v bc >/dev/null 2>&1; then
    print_error "bc (calculator) is required but not installed"
    exit 1
fi

# Set up cleanup trap. It is installed only after validation has passed, so a
# rejected invocation never deletes anything.
trap cleanup EXIT

# Run main function
main