//! Verification module with Boyer-Moore-Horspool pattern matching
//! 
//! Implements verification algorithms including Boyer-Moore-Horspool for
//! pattern verification and integrity checks for indexes and metadata.

use crate::error::{BigGrepError, BigGrepResult};
use crate::index::{IndexReader, EFTrie};
use crate::ngram::{NgramCount, Token};
use regex::Regex;
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::{Read, Seek, SeekFrom};
use std::path::{Path, PathBuf};
use memmap2::Mmap;

/// Boyer-Moore-Horspool pattern matching implementation
///
/// The matcher works on raw bytes. Construction precomputes a 256-entry
/// shift table so that a search can skip ahead by more than one byte
/// whenever the last byte of the current window rules out closer alignments.
#[derive(Debug)]
pub struct BoyerMooreHorspool {
    /// Bytes of the pattern being searched for.
    pattern: Vec<u8>,
    /// For every possible byte value, how far the search window may advance
    /// when that byte is the last byte of the current window.
    bad_char_table: [usize; 256],
}

impl BoyerMooreHorspool {
    /// Create a new Boyer-Moore-Horspool matcher for the given pattern
    ///
    /// An empty pattern is accepted; such a matcher never reports a match.
    pub fn new(pattern: &str) -> Self {
        let pattern_bytes = pattern.as_bytes().to_vec();
        let bad_char_table = Self::build_bad_char_table(&pattern_bytes);
        Self {
            pattern: pattern_bytes,
            bad_char_table,
        }
    }

    /// Build the bad character (shift) table for Boyer-Moore-Horspool
    ///
    /// Bytes that do not occur in `pattern[..m - 1]` get a shift of `m` (the
    /// whole pattern length). Every other byte gets the distance from its
    /// last occurrence in `pattern[..m - 1]` to the end of the pattern. The
    /// final pattern byte is deliberately excluded, otherwise its shift
    /// would be zero and the search could not make progress.
    fn build_bad_char_table(pattern: &[u8]) -> [usize; 256] {
        let m = pattern.len();
        // `max(1)` keeps every shift positive even for an empty pattern.
        let mut table = [m.max(1); 256];

        for (i, &c) in pattern.iter().enumerate().take(m.saturating_sub(1)) {
            table[c as usize] = m - 1 - i;
        }

        table
    }

    /// Lazily yield the start offset of every occurrence of the pattern in
    /// `text`, in increasing order. Overlapping occurrences are included.
    fn match_offsets<'a>(&'a self, text: &'a [u8]) -> impl Iterator<Item = usize> + 'a {
        let m = self.pattern.len();
        let n = text.len();
        let mut pos = 0usize;

        std::iter::from_fn(move || {
            if m == 0 || m > n {
                return None;
            }

            while pos <= n - m {
                let start = pos;
                let window = &text[start..start + m];

                // Advance by the shift for the window's last byte. This is
                // the smallest shift that can produce another alignment, so
                // no occurrence (overlapping or not) is skipped.
                pos += self.bad_char_table[window[m - 1] as usize];

                if window == self.pattern.as_slice() {
                    return Some(start);
                }
            }

            None
        })
    }

    /// Search for the pattern in the given text
    /// Returns the byte offset of the first match, or None if not found
    ///
    /// Returns `None` when the pattern is empty or longer than `text`.
    pub fn search(&self, text: &[u8]) -> Option<usize> {
        self.match_offsets(text).next()
    }

    /// Search for all occurrences of the pattern
    ///
    /// Returns the byte offsets of every match in increasing order,
    /// including overlapping matches. The result is empty when the pattern
    /// is empty or longer than `text`.
    pub fn find_all(&self, text: &[u8]) -> Vec<usize> {
        self.match_offsets(text).collect()
    }

    /// Get the pattern being searched for
    pub fn pattern(&self) -> &[u8] {
        &self.pattern
    }
}

/// Pattern verifier for files and in-memory text, backed by a
/// `BoyerMooreHorspool` matcher built once at construction time.
#[derive(Debug)]
pub struct TextVerifier {
    matcher: BoyerMooreHorspool,
}

impl TextVerifier {
    /// Build a verifier that looks for `pattern`.
    pub fn new(pattern: &str) -> Self {
        let matcher = BoyerMooreHorspool::new(pattern);
        Self { matcher }
    }

    /// Memory-map `file_path` and report every offset at which the pattern
    /// occurs.
    ///
    /// # Errors
    ///
    /// Returns `BigGrepError::Io` if the file cannot be opened and
    /// `BigGrepError::MemoryMap` if mapping it fails.
    pub fn verify_file(&self, file_path: &Path) -> BigGrepResult<VerificationResult> {
        let file = File::open(file_path).map_err(BigGrepError::Io)?;

        // NOTE(review): mapping is `unsafe` because the mapped bytes may
        // change if the file is modified while mapped; this code assumes it
        // is not — confirm against how callers use the files.
        let mmap = unsafe { Mmap::map(&file) }
            .map_err(|e| BigGrepError::MemoryMap(e.to_string()))?;

        let offsets = self.matcher.find_all(&mmap);
        Ok(Self::build_result(file_path.to_path_buf(), offsets))
    }

    /// Report every offset at which the pattern occurs in `text`.
    ///
    /// The result's `file_path` is the literal placeholder `<text>`.
    pub fn verify_text(&self, text: &str) -> VerificationResult {
        let offsets = self.matcher.find_all(text.as_bytes());
        Self::build_result(PathBuf::from("<text>"), offsets)
    }

    /// Package raw match offsets into a `VerificationResult` for `source`.
    fn build_result(source: PathBuf, offsets: Vec<usize>) -> VerificationResult {
        let count = offsets.len() as u64;
        let offsets = offsets.into_iter().map(|offset| offset as u64).collect();
        VerificationResult::new(source, count, offsets)
    }
}

/// Outcome of checking a single source (file or text buffer) for a pattern.
#[derive(Debug, Clone)]
pub struct VerificationResult {
    /// Path of the source that was checked.
    pub file_path: PathBuf,
    /// Number of matches as supplied to [`VerificationResult::new`].
    pub match_count: u64,
    /// Offsets of the matches as supplied to [`VerificationResult::new`].
    pub match_offsets: Vec<u64>,
    /// `true` until an error is attached via [`VerificationResult::with_error`].
    pub verified: bool,
    /// Error messages attached so far, in insertion order.
    pub errors: Vec<String>,
}

impl VerificationResult {
    /// Create a result that is marked verified and carries no errors.
    pub fn new(file_path: PathBuf, match_count: u64, match_offsets: Vec<u64>) -> Self {
        let errors = Vec::new();
        Self {
            verified: true,
            errors,
            file_path,
            match_count,
            match_offsets,
        }
    }

    /// Consume the result, append `error` to its error list and mark it as
    /// not verified.
    pub fn with_error(self, error: String) -> Self {
        let mut errors = self.errors;
        errors.push(error);
        Self {
            verified: false,
            errors,
            ..self
        }
    }
}

/// Settings that control what the verification engine checks.
#[derive(Debug, Clone)]
pub struct VerificationOptions {
    pub check_binary_files: bool,
    /// Whether index verification runs the count checks.
    pub verify_counts: bool,
    /// Whether index verification runs the structure checks.
    pub verify_structure: bool,
    /// Files larger than this many bytes are reported as too large instead
    /// of being scanned during pattern verification.
    pub max_file_size: u64,
    pub pattern: Option<String>,
}

impl Default for VerificationOptions {
    /// Defaults: count and structure checks enabled, binary-file checking
    /// disabled, no pattern, and a 100MB file-size limit.
    fn default() -> Self {
        const MIB: u64 = 1024 * 1024;

        Self {
            pattern: None,
            max_file_size: 100 * MIB, // 100MB
            verify_structure: true,
            verify_counts: true,
            check_binary_files: false,
        }
    }
}

/// Main verification engine for index integrity and pattern verification
#[derive(Debug)]
pub struct VerificationEngine {
    /// Settings consulted by the individual verification passes.
    options: VerificationOptions,
}

impl VerificationEngine {
    /// Create an engine that uses the given options.
    pub fn new(options: VerificationOptions) -> Self {
        Self { options }
    }

    /// Verify index integrity and structure
    ///
    /// Loads the index at `index_path` and runs the structure and count
    /// passes that are enabled in the options. Findings are collected in
    /// the returned report rather than raised as errors.
    ///
    /// # Errors
    ///
    /// Returns `BigGrepError::Verification` if the index cannot be loaded,
    /// or whatever error an enabled pass returns.
    pub fn verify_index(&self, index_path: &Path) -> BigGrepResult<IndexVerificationReport> {
        let index_reader = IndexReader::load(index_path)
            .map_err(|e| BigGrepError::Verification(e.to_string()))?;

        let mut report = IndexVerificationReport::new(index_path.to_path_buf());

        // Verify basic structure
        if self.options.verify_structure {
            self.verify_index_structure(&index_reader, &mut report)?;
        }

        // Verify counts if requested
        if self.options.verify_counts {
            self.verify_index_counts(&index_reader, &mut report)?;
        }

        Ok(report)
    }

    /// Verify pattern occurrence across multiple files
    ///
    /// Produces one result per entry of `files`, in the same order. Files
    /// that do not exist or exceed `max_file_size` get a failed result with
    /// an explanatory error instead of being scanned.
    ///
    /// # Errors
    ///
    /// Returns `BigGrepError::Io` if a file's metadata cannot be read, or
    /// the error from scanning a file; either aborts the whole batch.
    pub fn verify_pattern(&self, pattern: &str, files: &[PathBuf]) -> BigGrepResult<Vec<VerificationResult>> {
        let verifier = TextVerifier::new(pattern);
        let mut results = Vec::with_capacity(files.len());

        for file_path in files {
            if !file_path.exists() {
                results.push(
                    VerificationResult::new(file_path.clone(), 0, Vec::new())
                        .with_error("File does not exist".to_string()),
                );
                continue;
            }

            let metadata = std::fs::metadata(file_path).map_err(BigGrepError::Io)?;

            if metadata.len() > self.options.max_file_size {
                results.push(
                    VerificationResult::new(file_path.clone(), 0, Vec::new())
                        .with_error("File too large".to_string()),
                );
                continue;
            }

            results.push(verifier.verify_file(file_path)?);
        }

        Ok(results)
    }

    /// Verify N-gram counts against ground truth
    ///
    /// Adds one file check per existing ground-truth file; missing files
    /// are skipped silently.
    ///
    /// # Errors
    ///
    /// Returns `BigGrepError::Verification` if the index cannot be loaded,
    /// or the error from counting a file's N-grams.
    pub fn verify_counts(&self, index_path: &Path, ground_truth_files: &[PathBuf]) -> BigGrepResult<CountVerificationReport> {
        let index_reader = IndexReader::load(index_path)
            .map_err(|e| BigGrepError::Verification(e.to_string()))?;

        let mut report = CountVerificationReport::new(index_path.to_path_buf());

        // For each file in ground truth, count actual N-grams and compare
        for file_path in ground_truth_files {
            if file_path.exists() {
                let actual_counts = self.count_actual_ngrams(file_path, index_reader.order())?;
                // The expected count should come from traversing the trie;
                // that is not implemented yet, so 0 stands in for it. The
                // count taken from the file belongs in the `actual` slot.
                report.add_file_check(file_path.clone(), 0, actual_counts.len() as u64);
            }
        }

        Ok(report)
    }

    /// Verify N-gram index structure
    ///
    /// Records an error on `report` for an empty trie, an N-gram order
    /// outside 2..=4, or an empty vocabulary, then attaches the metrics
    /// derived from the index statistics. Currently always returns `Ok`.
    fn verify_index_structure(&self, index: &IndexReader, report: &mut IndexVerificationReport) -> BigGrepResult<()> {
        let stats = index.stats();

        // Basic structure checks
        if stats.node_count == 0 {
            report.add_error("Empty trie".to_string());
        }

        if stats.order < 2 || stats.order > 4 {
            report.add_error(format!("Invalid N-gram order: {}", stats.order));
        }

        if stats.vocab_size == 0 {
            report.add_error("Empty vocabulary".to_string());
        }

        // Performance metrics
        report.set_metrics(IndexMetrics {
            node_count: stats.node_count,
            vocab_size: stats.vocab_size,
            total_ngrams: stats.total_ngrams,
            compression_ratio: self.calculate_compression_ratio(&stats),
        });

        Ok(())
    }

    /// Verify N-gram counts in index
    ///
    /// Placeholder: only records a warning that the check is not fully
    /// implemented. The index is not inspected yet.
    fn verify_index_counts(&self, _index: &IndexReader, report: &mut IndexVerificationReport) -> BigGrepResult<()> {
        // This would involve traversing the trie and checking count consistency
        // For now, we'll add a placeholder verification
        report.add_warning("Count verification not fully implemented".to_string());
        Ok(())
    }

    /// Count actual N-grams in a file for comparison
    ///
    /// Placeholder: always returns an empty map; neither argument is read.
    fn count_actual_ngrams(&self, _file_path: &Path, _order: usize) -> BigGrepResult<HashMap<Vec<String>, u64>> {
        // Simplified implementation - would use actual tokenization
        // For now, return empty map
        Ok(HashMap::new())
    }

    /// Calculate compression ratio for the index
    ///
    /// Placeholder: always returns 0.5; the statistics are not read.
    fn calculate_compression_ratio(&self, _stats: &crate::index::TrieStats) -> f64 {
        // Simplified calculation - would compare raw vs compressed sizes
        0.5 // Placeholder
    }
}

/// Findings collected while verifying a single index.
#[derive(Debug)]
pub struct IndexVerificationReport {
    /// Path of the index the report describes.
    pub index_path: PathBuf,
    /// Error messages; recording one marks the report as not verified.
    pub errors: Vec<String>,
    /// Warning messages; these do not affect `verified`.
    pub warnings: Vec<String>,
    /// `true` until the first error is recorded.
    pub verified: bool,
    /// Metrics attached via `set_metrics`, if any.
    pub metrics: Option<IndexMetrics>,
}

impl IndexVerificationReport {
    /// Start an empty report for `index_path`, initially marked verified.
    pub fn new(index_path: PathBuf) -> Self {
        Self {
            verified: true,
            metrics: None,
            warnings: Vec::new(),
            errors: Vec::new(),
            index_path,
        }
    }

    /// Record an error and mark the report as not verified.
    pub fn add_error(&mut self, error: String) {
        self.verified = false;
        self.errors.push(error);
    }

    /// Record a warning without changing the verified flag.
    pub fn add_warning(&mut self, warning: String) {
        self.warnings.push(warning);
    }

    /// Attach metrics, replacing any previously attached ones.
    pub fn set_metrics(&mut self, metrics: IndexMetrics) {
        self.metrics = Some(metrics);
    }
}

/// Per-file count comparisons collected for a single index.
#[derive(Debug)]
pub struct CountVerificationReport {
    /// Path of the index the report describes.
    pub index_path: PathBuf,
    /// One entry per call to `add_file_check`, in call order.
    pub file_checks: Vec<FileCheck>,
    /// Number of recorded checks whose expected and actual counts differ.
    pub total_mismatches: u64,
}

impl CountVerificationReport {
    /// Start an empty report for `index_path`.
    pub fn new(index_path: PathBuf) -> Self {
        Self {
            total_mismatches: 0,
            file_checks: Vec::new(),
            index_path,
        }
    }

    /// Record the comparison for one file, bumping the mismatch counter
    /// when `expected` and `actual` are not equal.
    pub fn add_file_check(&mut self, file_path: PathBuf, expected: u64, actual: u64) {
        let mismatched = actual != expected;
        self.total_mismatches += u64::from(mismatched);

        let check = FileCheck {
            file_path,
            expected_count: expected,
            actual_count: actual,
            mismatched,
        };
        self.file_checks.push(check);
    }
}

/// Index metrics for performance analysis
///
/// Filled in by the engine's structure verification from the index
/// statistics and stored on `IndexVerificationReport::metrics`.
#[derive(Debug, Clone)]
pub struct IndexMetrics {
    /// Node count copied from the index statistics.
    pub node_count: usize,
    /// Vocabulary size copied from the index statistics.
    pub vocab_size: usize,
    /// Total N-gram count copied from the index statistics.
    pub total_ngrams: usize,
    /// Result of the engine's compression-ratio calculation.
    /// NOTE(review): that calculation is currently a fixed placeholder value.
    pub compression_ratio: f64,
}

/// Individual file check result
///
/// Created by `CountVerificationReport::add_file_check`, one per checked file.
#[derive(Debug, Clone)]
pub struct FileCheck {
    /// Path of the file the check refers to.
    pub file_path: PathBuf,
    /// The `expected` value passed to `add_file_check`.
    pub expected_count: u64,
    /// The `actual` value passed to `add_file_check`.
    pub actual_count: u64,
    /// `true` when `expected_count` and `actual_count` differ.
    pub mismatched: bool,
}

/// Spot-check verifier for random sampling
#[derive(Debug)]
pub struct SpotCheckVerifier {
    verifier: VerificationEngine,
    sample_rate: f64,
}

impl SpotCheckVerifier {
    pub fn new(verifier: VerificationEngine, sample_rate: f64) -> Self {
        Self {
            verifier,
            sample_rate: sample_rate.clamp(0.0, 1.0),
        }
    }
    
    /// Perform spot checks on randomly selected files
    pub fn spot_check(&self, files: &[PathBuf], pattern: &str) -> BigGrepResult<SpotCheckReport> {
        let mut report = SpotCheckReport::new(pattern.to_string());
        
        let sample_size = ((files.len() as f64) * self.sample_rate).ceil() as usize;
        let sampled_files = Self::sample_files(files, sample_size);
        
        for file_path in sampled_files {
            let result = self.verifier.verify_pattern(pattern, &[file_path])?;
            if let Some(verification_result) = result.first() {
                report.add_check(verification_result.clone());
            }
        }
        
        Ok(report)
    }
    
    /// Randomly sample files for spot checking
    fn sample_files(files: &[PathBuf], sample_size: usize) -> Vec<PathBuf> {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};
        
        let mut sampled = Vec::new();
        let mut used_indices = HashSet::new();
        
        for _ in 0..sample_size.min(files.len()) {
            // Simple random selection based on hash
            let mut hasher = DefaultHasher::new();
            std::time::SystemTime::now().hash(&mut hasher);
            let hash = hasher.finish();
            let index = (hash as usize) % files.len();
            
            if used_indices.insert(index) {
                sampled.push(files[index].clone());
            }
        }
        
        sampled
    }
}

/// Aggregated results of one spot-check run for a pattern.
#[derive(Debug)]
pub struct SpotCheckReport {
    /// Pattern the spot check searched for.
    pub pattern: String,
    /// Every recorded result, in the order it was added.
    pub checks: Vec<VerificationResult>,
    /// Number of results recorded.
    pub total_files: usize,
    /// Number of recorded results whose `verified` flag was `false`.
    pub failed_files: usize,
}

impl SpotCheckReport {
    /// Start an empty report for `pattern`.
    pub fn new(pattern: String) -> Self {
        Self {
            failed_files: 0,
            total_files: 0,
            checks: Vec::new(),
            pattern,
        }
    }

    /// Record one result and update the total and failure counters.
    pub fn add_check(&mut self, check: VerificationResult) {
        let failed = !check.verified;
        self.checks.push(check);

        self.total_files += 1;
        if failed {
            self.failed_files += 1;
        }
    }

    /// Fraction of recorded results that were verified; `1.0` when nothing
    /// has been recorded yet.
    pub fn success_rate(&self) -> f64 {
        match self.total_files {
            0 => 1.0,
            total => (total - self.failed_files) as f64 / total as f64,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Both occurrences of the pattern are reported, in order.
    #[test]
    fn test_boyer_moore_horspool() {
        let haystack = "hello world hello universe";
        let found = BoyerMooreHorspool::new("hello").find_all(haystack.as_bytes());

        assert_eq!(found, vec![0, 12]);
        assert_eq!(found.first(), Some(&0));
        assert_eq!(found.get(1), Some(&12));
    }

    /// A pattern that does not occur yields no offsets.
    #[test]
    fn test_boyer_moore_horspool_no_match() {
        let found = BoyerMooreHorspool::new("xyz").find_all("hello world".as_bytes());
        assert!(found.is_empty());
    }

    /// The verifier reports a count and an offset list of matching length.
    #[test]
    fn test_text_verifier() {
        let outcome = TextVerifier::new("test")
            .verify_text("this is a test string with test multiple times");

        assert_eq!(outcome.match_count, 2);
        assert_eq!(outcome.match_offsets.len(), 2);
    }

    /// Default options keep the documented limits and enabled checks.
    #[test]
    fn test_verification_options() {
        let defaults = VerificationOptions::default();

        assert!(defaults.verify_structure);
        assert!(defaults.verify_counts);
        assert_eq!(defaults.max_file_size, 100 * 1024 * 1024);
    }

    /// A fresh report is verified; adding an error flips that.
    #[test]
    fn test_index_verification_report() {
        let index_path = PathBuf::from("test.index");
        let mut report = IndexVerificationReport::new(index_path.clone());

        assert!(report.errors.is_empty());
        assert!(report.verified);
        assert_eq!(report.index_path, index_path);

        report.add_error("Test error".to_string());

        assert_eq!(report.errors.len(), 1);
        assert!(!report.verified);
    }

    /// Sampling returns a non-empty subset no larger than requested.
    #[test]
    fn test_spot_check_sampling() {
        let files: Vec<PathBuf> = (0..10)
            .map(|i| PathBuf::from(format!("file{}.txt", i)))
            .collect();
        let sampled = SpotCheckVerifier::sample_files(&files, 3);

        assert!(!sampled.is_empty());
        assert!(sampled.len() <= 3);

        // All sampled files should be from the original list
        assert!(sampled.iter().all(|picked| files.contains(picked)));
    }
}
