//! Parallel processing utilities for BigGrep operations
//! 
//! Provides thread pool management and parallel processing utilities
//! for high-performance N-gram processing and search operations.

use crate::error::{BigGrepError, BigGrepResult};
use rayon::prelude::*;
use std::sync::{Arc, Mutex};
use std::thread;
use std::time::{Duration, Instant};
use crossbeam_channel::{unbounded, Sender, Receiver};
use rayon::ThreadPool;

/// Thread pool for parallel operations.
///
/// Owns a dedicated rayon pool together with the [`PoolConfig`] it was built
/// from, and offers helpers that run common parallel patterns on that pool
/// (optionally logging how long each call took).
// NOTE(review): this type shares its name with the `rayon::ThreadPool` item
// imported at the top of the file; one of the two has to be renamed or
// aliased for the module to compile.
#[derive(Debug)]
pub struct ThreadPool {
    pool: rayon::ThreadPool,
    config: PoolConfig,
}

impl ThreadPool {
    /// Create a new thread pool using `PoolConfig::default()`.
    ///
    /// # Errors
    /// Returns `BigGrepError::Internal` when the underlying pool cannot be built.
    pub fn new() -> BigGrepResult<Self> {
        Self::with_config(PoolConfig::default())
    }

    /// Create a thread pool from a custom configuration.
    ///
    /// `num_threads` and, when set, `stack_size` are forwarded to the rayon
    /// builder; workers are named `biggrep-worker-<index>`.
    ///
    /// # Errors
    /// Returns `BigGrepError::Internal` when the underlying pool cannot be built.
    // NOTE(review): `enable_work_stealing` is not consulted here.
    pub fn with_config(config: PoolConfig) -> BigGrepResult<Self> {
        let mut builder = rayon::ThreadPoolBuilder::new()
            .num_threads(config.num_threads)
            .thread_name(|i| format!("biggrep-worker-{}", i));

        // Previously the configured stack size was silently dropped.
        if let Some(stack_size) = config.stack_size {
            builder = builder.stack_size(stack_size);
        }

        let pool = builder
            .build()
            .map_err(|e| BigGrepError::Internal(format!("Failed to create thread pool: {}", e)))?;

        Ok(Self { pool, config })
    }

    /// Run `f` inside this pool and return its result.
    pub fn install<F, R>(&self, f: F) -> R
    where
        F: FnOnce() -> R + Send,
        R: Send,
    {
        self.pool.install(f)
    }

    /// Map `f` over every item of `iter` on this pool and collect the results.
    pub fn par_iter<T, I, F, R>(&self, iter: I, f: F) -> BigGrepResult<Vec<R>>
    where
        T: Send,
        I: IntoParallelIterator<Item = T>,
        F: Fn(T) -> R + Send + Sync,
        R: Send,
    {
        let start = Instant::now();

        // Convert before entering the pool: the parallel iterator is `Send`
        // by definition, whereas `I` itself carries no such bound. No work is
        // executed until `collect` runs inside `install`.
        let source = iter.into_par_iter();
        let results: Vec<R> = self.install(move || source.map(f).collect());

        self.log_elapsed("Parallel operation", start);
        Ok(results)
    }

    /// Apply `f` to consecutive chunks of `data` on this pool.
    ///
    /// A `chunk_size` of zero is treated as one. The former extra type
    /// parameter was removed because nothing constrained it, which made the
    /// method impossible to call without spelling out a dummy type.
    pub fn par_chunks<T, F, R>(&self, data: &[T], chunk_size: usize, f: F) -> BigGrepResult<Vec<R>>
    where
        T: Sync,
        F: Fn(&[T]) -> R + Send + Sync,
        R: Send,
    {
        let start = Instant::now();

        // rayon rejects empty chunks, so clamp instead of panicking.
        let chunk_size = chunk_size.max(1);
        let results: Vec<R> =
            self.install(move || data.par_chunks(chunk_size).map(f).collect());

        self.log_elapsed("Chunked parallel operation", start);
        Ok(results)
    }

    /// Fold every item of `iter` into an accumulator that starts at `initial`.
    ///
    /// The items are produced on this pool, but `f` is applied sequentially:
    /// it can only absorb an item into an accumulator, and without a way to
    /// merge two accumulators partial results from different workers could
    /// not be combined.
    pub fn par_fold<T, I, F, A>(&self, iter: I, initial: A, f: F) -> BigGrepResult<A>
    where
        T: Send,
        I: IntoParallelIterator<Item = T>,
        F: Fn(A, T) -> A + Send + Sync,
        A: Send + Default,
    {
        let start = Instant::now();

        let source = iter.into_par_iter();
        let items: Vec<T> = self.install(move || source.collect());
        let result = items.into_iter().fold(initial, f);

        self.log_elapsed("Parallel fold", start);
        Ok(result)
    }

    /// Report the configured thread count and the size of this pool.
    pub fn stats(&self) -> PoolStats {
        PoolStats {
            num_threads: self.config.num_threads,
            // Ask this pool, not whichever pool the caller happens to be in.
            current_threads: self.pool.current_num_threads(),
        }
    }

    /// Log the time elapsed since `start` when performance logging is enabled.
    fn log_elapsed(&self, operation: &str, start: Instant) {
        if self.config.log_performance {
            log::info!(
                "{} completed in {:.2}s",
                operation,
                start.elapsed().as_secs_f64()
            );
        }
    }
}

/// Settings a thread pool is built from.
#[derive(Debug, Clone)]
pub struct PoolConfig {
    /// Number of worker threads requested from the pool builder.
    pub num_threads: usize,
    /// When `true`, the pool's parallel helpers log how long each call took.
    pub log_performance: bool,
    /// Work-stealing toggle.
    // NOTE(review): not read anywhere in the visible part of this file.
    pub enable_work_stealing: bool,
    /// Optional stack size setting; `None` by default.
    pub stack_size: Option<usize>,
}

impl Default for PoolConfig {
    /// Defaults: the thread count rayon reports at call time, performance
    /// logging off, work stealing flagged on, and no stack size.
    fn default() -> Self {
        let detected_threads = rayon::current_num_threads();

        PoolConfig {
            stack_size: None,
            enable_work_stealing: true,
            log_performance: false,
            num_threads: detected_threads,
        }
    }
}

/// Thread pool statistics
///
/// Plain snapshot of thread counts, as produced by the pool's `stats()` method.
#[derive(Debug, Clone)]
pub struct PoolStats {
    /// Thread count taken from the pool's configuration.
    pub num_threads: usize,
    /// Thread count reported by rayon when the snapshot was taken.
    pub current_threads: usize,
}

/// Parallel processor for batch operations
#[derive(Debug)]
pub struct ParallelProcessor {
    thread_pool: ThreadPool,
    batch_size: usize,
}

impl ParallelProcessor {
    /// Create new parallel processor
    pub fn new(num_threads: usize) -> BigGrepResult<Self> {
        let config = PoolConfig {
            num_threads,
            ..Default::default()
        };
        let thread_pool = ThreadPool::with_config(config)?;
        
        Ok(Self {
            thread_pool,
            batch_size: 1000,
        })
    }
    
    /// Set batch size for chunked operations
    pub fn with_batch_size(mut self, batch_size: usize) -> Self {
        self.batch_size = batch_size.max(1);
        self
    }
    
    /// Process items in parallel with callback
    pub fn process_items<T, F, R>(&self, items: &[T], processor: F) -> BigGrepResult<Vec<R>>
    where
        T: Send + Clone,
        F: Fn(usize, &T) -> R + Send + Sync,
        R: Send,
    {
        let indexed_items: Vec<(usize, &T)> = items.iter().enumerate().collect();
        
        self.thread_pool.par_iter(indexed_items, |(idx, item)| {
            processor(*idx, *item)
        })
    }
    
    /// Process items in parallel batches
    pub fn process_batches<T, F, R>(&self, items: &[T], batch_processor: F) -> BigGrepResult<Vec<R>>
    where
        T: Send + Clone,
        F: Fn(&[T]) -> R + Send + Sync,
        R: Send,
    {
        self.thread_pool.par_chunks(items, self.batch_size, batch_processor)
    }
    
    /// Process files in parallel
    pub fn process_files<F, R>(&self, files: &[std::path::PathBuf], processor: F) -> BigGrepResult<Vec<R>>
    where
        F: Fn(&std::path::Path) -> BigGrepResult<R> + Send + Sync,
        R: Send,
    {
        self.thread_pool.par_iter(files, |file_path| {
            processor(file_path).unwrap_or_else(|e| {
                log::warn!("Failed to process file {:?}: {}", file_path, e);
                panic!("Processing failed")
            })
        })
    }
    
    /// Aggregate results from parallel operations
    pub fn aggregate_results<T, F, A, R>(&self, items: &[T], aggregator: F) -> BigGrepResult<A>
    where
        T: Send,
        F: Fn(A, T) -> A + Send + Sync,
        A: Send + Default + Clone,
        R: IntoIterator<Item = T>,
    {
        let results: Vec<R> = self.process_items(items, |_, item| {
            vec![item.clone()]
        })?;
        
        let flattened: Vec<T> = results.into_iter().flatten().collect();
        
        self.thread_pool.par_fold(flattened, A::default(), aggregator)
    }
}

/// Shared work buffer guarded by a mutex.
///
/// Items live in a `Vec` behind `Arc<Mutex<..>>`. `pop` removes from the end
/// of that vector, so the most recently pushed item is handed out first.
#[derive(Debug)]
pub struct WorkQueue<T> {
    queue: Arc<Mutex<Vec<T>>>,
    workers: usize,
}

impl<T> WorkQueue<T> {
    /// Build an empty queue with room pre-allocated for `capacity` items.
    ///
    /// `workers` is stored as given.
    pub fn new(capacity: usize, workers: usize) -> Self {
        let storage = Vec::with_capacity(capacity);

        WorkQueue {
            workers,
            queue: Arc::new(Mutex::new(storage)),
        }
    }

    /// Append `item` to the queue.
    ///
    /// # Panics
    /// Panics if the internal mutex is poisoned.
    pub fn push(&self, item: T) {
        self.queue.lock().unwrap().push(item);
    }

    /// Remove and return the most recently pushed item, if any.
    ///
    /// # Panics
    /// Panics if the internal mutex is poisoned.
    pub fn pop(&self) -> Option<T> {
        let mut pending = self.queue.lock().unwrap();
        pending.pop()
    }

    /// Number of items currently queued.
    ///
    /// # Panics
    /// Panics if the internal mutex is poisoned.
    pub fn len(&self) -> usize {
        let pending = self.queue.lock().unwrap();
        pending.len()
    }

    /// `true` when no items are queued.
    pub fn is_empty(&self) -> bool {
        matches!(self.len(), 0)
    }
}

/// Producer-consumer channel for parallel pipelines.
///
/// Holds both ends of a crossbeam channel so producers can be handed the
/// sender while [`process_stages`](Self::process_stages) drains the receiver.
#[derive(Debug)]
pub struct ParallelPipeline<T> {
    sender: Sender<T>,
    receiver: Receiver<T>,
}

impl<T> ParallelPipeline<T> {
    /// Create a new pipeline backed by an unbounded channel.
    ///
    /// `buffer_size` is accepted for API compatibility but does not limit the
    /// channel: sends never block.
    pub fn new(buffer_size: usize) -> Self {
        // Explicitly discard the hint so the parameter is not silently unused.
        let _ = buffer_size;

        let (sender, receiver) = unbounded();
        Self { sender, receiver }
    }

    /// Get sender end of pipeline
    pub fn sender(&self) -> &Sender<T> {
        &self.sender
    }

    /// Get receiver end of pipeline
    pub fn receiver(&self) -> &Receiver<T> {
        &self.receiver
    }

    /// Drain the items currently queued and run them through two stages.
    ///
    /// Stage 1 maps every queued item with `stage1` in parallel; the order of
    /// the transformed values is not guaranteed to match the send order.
    /// Stage 2 then passes a clone of each transformed value to `stage2`
    /// sequentially.
    ///
    /// Only items already in the channel are consumed. The pipeline owns a
    /// sender itself, so waiting for the channel to disconnect would never
    /// finish.
    ///
    /// # Errors
    /// Returns `BigGrepError::Internal` wrapping the first `stage2` failure.
    pub fn process_stages<F1, F2, R>(&self, stage1: F1, stage2: F2) -> BigGrepResult<Vec<R>>
    where
        T: Send,
        F1: Fn(T) -> R + Send + Sync,
        F2: Fn(R) -> BigGrepResult<()> + Send + Sync,
        R: Send + Clone,
    {
        // Stage 1: Transform (non-blocking drain of what is queued right now)
        let transformed: Vec<R> = self
            .receiver
            .try_iter()
            .par_bridge()
            .map(stage1)
            .collect();

        // Stage 2: Process results; `stage2` takes ownership, the caller
        // still gets the values back, hence the clone.
        for result in &transformed {
            stage2(result.clone()).map_err(|e| {
                BigGrepError::Internal(format!("Pipeline stage 2 failed: {}", e))
            })?;
        }

        Ok(transformed)
    }
}

/// Performance monitor for parallel operations.
///
/// Records start instants by numeric id and accumulates, per operation name,
/// how many operations finished and how much time they took in total.
///
/// The list of start instants only shrinks on [`reset`](Self::reset); ids
/// handed out before a reset do not refer to the same operation afterwards.
#[derive(Debug)]
pub struct PerformanceMonitor {
    start_times: Arc<Mutex<Vec<Instant>>>,
    operation_counts: Arc<Mutex<std::collections::HashMap<String, usize>>>,
    total_times: Arc<Mutex<std::collections::HashMap<String, Duration>>>,
}

impl Default for PerformanceMonitor {
    fn default() -> Self {
        Self::new()
    }
}

impl PerformanceMonitor {
    /// Create a monitor with no recorded operations.
    pub fn new() -> Self {
        Self {
            start_times: Arc::new(Mutex::new(Vec::new())),
            operation_counts: Arc::new(Mutex::new(std::collections::HashMap::new())),
            total_times: Arc::new(Mutex::new(std::collections::HashMap::new())),
        }
    }

    /// Record the current instant and return the id to pass to `end_operation`.
    ///
    /// # Panics
    /// Panics if the internal mutex is poisoned.
    pub fn start_operation(&self) -> usize {
        let mut times = self.start_times.lock().unwrap();
        times.push(Instant::now());
        times.len() - 1
    }

    /// Attribute the time since `operation_id` was started to `operation_name`.
    ///
    /// Unknown ids are ignored.
    ///
    /// # Panics
    /// Panics if an internal mutex is poisoned.
    pub fn end_operation(&self, operation_id: usize, operation_name: &str) {
        // The guard is released at the end of this statement; only the
        // copied instant leaves the lock.
        let started = self
            .start_times
            .lock()
            .unwrap()
            .get(operation_id)
            .copied();

        let elapsed = match started {
            Some(start_time) => start_time.elapsed(),
            None => return,
        };

        // Lock order (counts, then totals) matches `get_stats`.
        let mut counts = self.operation_counts.lock().unwrap();
        let mut totals = self.total_times.lock().unwrap();

        *counts.entry(operation_name.to_string()).or_insert(0) += 1;
        *totals
            .entry(operation_name.to_string())
            .or_insert(Duration::from_secs(0)) += elapsed;
    }

    /// Build a snapshot of the per-operation counts, totals and averages.
    ///
    /// # Panics
    /// Panics if an internal mutex is poisoned.
    pub fn get_stats(&self) -> PerformanceStats {
        // Hold both maps at once so counts and totals belong to the same
        // point in time, and to avoid cloning the maps wholesale.
        let counts = self.operation_counts.lock().unwrap();
        let totals = self.total_times.lock().unwrap();

        let mut stats = PerformanceStats::new();
        for (name, &count) in counts.iter() {
            if let Some(&total_time) = totals.get(name) {
                let avg_time = average_duration(total_time, count);
                stats.add_operation(name.clone(), count, total_time, avg_time);
            }
        }

        stats
    }

    /// Forget every start instant, count and accumulated time.
    ///
    /// # Panics
    /// Panics if an internal mutex is poisoned.
    pub fn reset(&self) {
        *self.start_times.lock().unwrap() = Vec::new();
        self.operation_counts.lock().unwrap().clear();
        self.total_times.lock().unwrap().clear();
    }
}

/// Divide `total` evenly across `count` operations (zero when `count` is zero).
///
/// Works on the full nanosecond value so that counts larger than `u32::MAX`
/// are neither truncated nor turned into a division by zero.
fn average_duration(total: Duration, count: usize) -> Duration {
    const NANOS_PER_SEC: u128 = 1_000_000_000;

    if count == 0 {
        return Duration::from_secs(0);
    }

    let nanos = total.as_nanos() / count as u128;
    // The quotient never exceeds `total`, so the seconds part fits in `u64`
    // and the remainder is below one second.
    Duration::new((nanos / NANOS_PER_SEC) as u64, (nanos % NANOS_PER_SEC) as u32)
}

/// Performance statistics: one [`OperationStats`] entry per operation name.
#[derive(Debug, Clone)]
pub struct PerformanceStats {
    operations: std::collections::HashMap<String, OperationStats>,
}

impl PerformanceStats {
    /// Create an empty statistics set.
    fn new() -> Self {
        Self {
            operations: std::collections::HashMap::new(),
        }
    }

    /// Insert (or replace) the entry for `name`.
    fn add_operation(&mut self, name: String, count: usize, total_time: Duration, avg_time: Duration) {
        self.operations.insert(
            name,
            OperationStats {
                count,
                total_time,
                avg_time,
            },
        );
    }

    /// Get total operation count
    pub fn total_operations(&self) -> usize {
        self.operations.values().map(|stats| stats.count).sum()
    }

    /// Get total time across all operations
    pub fn total_time(&self) -> Duration {
        self.operations.values().map(|stats| stats.total_time).sum()
    }

    /// Get average time per operation (zero when nothing was recorded).
    pub fn avg_time_per_operation(&self) -> Duration {
        average_duration(self.total_time(), self.total_operations())
    }
}

/// Individual operation statistics
#[derive(Debug, Clone)]
pub struct OperationStats {
    /// Number of completed operations recorded under this name.
    pub count: usize,
    /// Sum of the elapsed times of those operations.
    pub total_time: Duration,
    /// `total_time` divided by `count`.
    pub avg_time: Duration,
}

/// Parallel iterator extensions
///
/// Convenience methods that run a closure over every element of a collection
/// using rayon's parallel iterators.
pub trait ParallelIteratorExt<T> {
    /// Apply a fallible `processor` to every element in parallel.
    ///
    /// # Errors
    /// Returns an error produced by `processor` if any element fails.
    fn parallel_process<F, R>(&self, processor: F) -> BigGrepResult<Vec<R>>
    where
        F: Fn(&T) -> BigGrepResult<R> + Send + Sync,
        R: Send;

    /// Apply `filter_mapper` to every element in parallel, keeping the
    /// `Some` results.
    fn parallel_filter_map<F, R>(&self, filter_mapper: F) -> BigGrepResult<Vec<R>>
    where
        F: Fn(&T) -> Option<R> + Send + Sync,
        R: Send;
}

// `T: Sync` is required because the elements are shared by reference across
// rayon's worker threads.
impl<T: Sync> ParallelIteratorExt<T> for Vec<T> {
    // Failures are logged and returned to the caller instead of panicking
    // inside a worker thread. When several elements fail, which error is
    // returned is decided by rayon's `Result` collection.
    // NOTE(review): collecting into a `Result` in parallel assumes
    // `BigGrepError: Send` — confirm against `crate::error`.
    fn parallel_process<F, R>(&self, processor: F) -> BigGrepResult<Vec<R>>
    where
        F: Fn(&T) -> BigGrepResult<R> + Send + Sync,
        R: Send,
    {
        self.par_iter()
            .map(|item| {
                processor(item).map_err(|e| {
                    log::warn!("Processing failed: {}", e);
                    e
                })
            })
            .collect()
    }

    fn parallel_filter_map<F, R>(&self, filter_mapper: F) -> BigGrepResult<Vec<R>>
    where
        F: Fn(&T) -> Option<R> + Send + Sync,
        R: Send,
    {
        let results: Vec<R> = self
            .par_iter()
            .filter_map(|item| filter_mapper(item))
            .collect();

        Ok(results)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::NamedTempFile;

    /// A default pool reports a positive configured and live thread count.
    #[test]
    fn test_thread_pool_creation() {
        let pool = ThreadPool::new().unwrap();
        let PoolStats {
            num_threads,
            current_threads,
        } = pool.stats();

        assert!(num_threads > 0);
        assert!(current_threads > 0);
    }

    /// `process_items` maps every element and keeps the input order.
    #[test]
    fn test_parallel_processor() {
        let input = vec![1, 2, 3, 4, 5];
        let worker = ParallelProcessor::new(2).unwrap();

        let doubled = worker
            .process_items(&input, |_, &value| value * 2)
            .unwrap();

        assert_eq!(doubled, vec![2, 4, 6, 8, 10]);
    }

    /// Items come back out of the queue in reverse push order.
    #[test]
    fn test_work_queue() {
        let queue = WorkQueue::new(10, 2);

        for value in 1..=3 {
            queue.push(value);
        }
        assert_eq!(queue.len(), 3);

        let expected_pops = vec![Some(3), Some(2), Some(1), None];
        for expected in expected_pops {
            assert_eq!(queue.pop(), expected);
        }
    }

    /// A timed operation shows up in the statistics with non-zero duration.
    #[test]
    fn test_performance_monitor() {
        let monitor = PerformanceMonitor::new();

        let id = monitor.start_operation();
        thread::sleep(Duration::from_millis(10));
        monitor.end_operation(id, "test_operation");

        let snapshot = monitor.get_stats();
        assert!(snapshot.total_operations() > 0);
        assert!(snapshot.total_time() > Duration::from_millis(0));
    }

    /// Both extension methods work on a plain `Vec`.
    #[test]
    fn test_parallel_iterator_ext() {
        let numbers = vec![1, 2, 3, 4, 5];

        let doubled = numbers.parallel_process(|&n| Ok(n * 2)).unwrap();
        assert_eq!(doubled, vec![2, 4, 6, 8, 10]);

        let keep_even = |&n: &i32| if n % 2 == 0 { Some(n) } else { None };
        let evens: Vec<i32> = numbers.parallel_filter_map(keep_even).unwrap();
        assert_eq!(evens, vec![2, 4]);
    }
}
