//! BigGrep Parser (rs-bgparse)
//!
//! Reads N-gram indexes and generates candidate file lists for search verification.
//! Implements hint-based searching with PFOR/VarByte index decoding.

use clap::{Parser};
use std::path::{Path, PathBuf};
use anyhow::{Result, Context};
use log::{info, warn, error, debug};
use biggrep_core::*;
use std::collections::{HashSet, BTreeSet};
use std::sync::Arc;
use byteorder::{ReadBytesExt, WriteBytesExt, BigEndian, LittleEndian};
use memmap2::{Mmap, MmapOptions};
use regex::Regex;

/// Install the global `env_logger` logger.
///
/// The builder starts from the default environment configuration and then has
/// its filter level set from `verbose`: `Info` when it is `true`, `Warn`
/// otherwise.
fn init_logging(verbose: bool) {
    let level = match verbose {
        true => log::LevelFilter::Info,
        false => log::LevelFilter::Warn,
    };

    let mut builder = env_logger::Builder::from_default_env();
    builder.filter_level(level);
    builder.init();
}

// Command-line arguments for rs-bgparse, parsed with clap's derive API.
//
// The `///` comments on the fields are used by clap as `--help` text, so they
// are user-facing strings; reviewer notes in this struct use plain `//`.
#[derive(Parser)]
#[command(name = "rs-bgparse")]
#[command(about = "BigGrep index parser - reads N-gram indexes and generates candidate file lists")]
#[command(version = "0.1.0")]
struct Cli {
    /// Index directory to search in
    #[arg(short, long)]
    directory: Option<PathBuf>,

    /// Search patterns (hex-encoded binary strings)
    // Several patterns may be given in one argument, separated by commas.
    #[arg(short, long, value_delimiter = ',')]
    patterns: Vec<String>,

    /// Maximum number of candidate files to return
    #[arg(long, default_value_t = 15000)]
    max_candidates: usize,

    /// Verbose output
    #[arg(short, long)]
    verbose: bool,

    /// Show compression statistics per N-gram
    #[arg(long)]
    stats: bool,

    /// Debug output
    #[arg(long)]
    debug: bool,

    /// Index file prefix (default: "index")
    // Long-only on purpose: a derived short flag would be `-p`, which
    // `patterns` already owns. clap requires short names to be unique and
    // asserts on the duplicate when the command is built in debug builds.
    #[arg(long, default_value = "index")]
    prefix: String,
}

fn main() -> Result<()> {
    let cli = Cli::parse();
    
    // Initialize logging
    env_logger::Builder::from_default_env()
        .filter_level(if cli.debug {
            log::LevelFilter::Debug
        } else if cli.verbose {
            log::LevelFilter::Info
        } else {
            log::LevelFilter::Warn
        })
        .init();
    
    info!("BigGrep Index Parser v{} starting", VERSION);
    
    // Validate inputs
    if cli.patterns.is_empty() {
        anyhow::bail!("No search patterns provided. Use -p/--patterns to specify hex-encoded patterns.");
    }
    
    if cli.directory.is_none() {
        anyhow::bail!("No directory specified. Use -d/--directory to specify index directory.");
    }
    
    let directory = cli.directory.as_ref().unwrap();
    if !directory.exists() || !directory.is_dir() {
        anyhow::bail!("Directory does not exist: {:?}", directory);
    }
    
    // Discover index files
    let index_files = discover_index_files(directory, &cli.prefix)?;
    if index_files.is_empty() {
        anyhow::bail!("No index files found in directory: {:?}", directory);
    }
    
    debug!("Found {} index files", index_files.len());
    
    // Process each index file
    let mut all_candidates = BTreeSet::new();
    
    for index_file in &index_files {
        debug!("Processing index file: {:?}", index_file);
        
        match process_index_file(index_file, &cli.patterns, cli.max_candidates, cli.stats) {
            Ok(candidates) => {
                debug!("Found {} candidates in {:?}", candidates.len(), index_file);
                for candidate in candidates {
                    all_candidates.insert(candidate);
                }
            }
            Err(e) => {
                warn!("Failed to process index file {:?}: {}", index_file, e);
            }
        }
    }
    
    // Output candidate files
    let candidates_vec: Vec<String> = all_candidates.into_iter().collect();
    
    if cli.verbose {
        println!("# Found {} candidate files", candidates_vec.len());
    }
    
    for candidate in candidates_vec {
        println!("{}", candidate);
    }
    
    Ok(())
}

/// List the index files directly inside `directory`.
///
/// An entry qualifies when it is a regular file whose UTF-8 file name starts
/// with `prefix` and ends with `.bgi`. File names that are not valid UTF-8 are
/// skipped. The returned paths are sorted so callers see a stable order.
///
/// # Errors
///
/// Returns an error if the directory cannot be read or if reading one of its
/// entries fails.
fn discover_index_files(directory: &Path, prefix: &str) -> Result<Vec<PathBuf>> {
    let listing = std::fs::read_dir(directory)
        .with_context(|| format!("Failed to read directory: {:?}", directory))?;

    let mut found = Vec::new();
    for item in listing {
        let candidate = item?.path();
        if !candidate.is_file() {
            continue;
        }

        let name_matches = candidate
            .file_name()
            .and_then(|name| name.to_str())
            .map_or(false, |name| name.starts_with(prefix) && name.ends_with(".bgi"));
        if name_matches {
            found.push(candidate);
        }
    }

    found.sort();
    Ok(found)
}

/// Search one index file for every pattern and return the union of the
/// resulting candidate paths.
///
/// The file is memory-mapped, its header and hints table are parsed, every
/// pattern is converted to N-grams of the order stored in the header, and each
/// N-gram set is then searched in turn. Searching stops early once the union
/// holds at least `max_candidates` entries. With `show_stats` set, summary
/// lines prefixed with `#` are printed to stdout.
///
/// # Errors
///
/// Fails if the file cannot be opened or mapped, if the header or hints are
/// invalid, if a pattern cannot be converted to N-grams, or if a search fails.
fn process_index_file(
    index_file: &Path,
    patterns: &[String],
    max_candidates: usize,
    show_stats: bool,
) -> Result<HashSet<String>> {
    let file = std::fs::File::open(index_file)
        .with_context(|| format!("Failed to open index file: {:?}", index_file))?;

    // SAFETY: the mapping is only ever read through `&Mmap` in this file.
    // NOTE(review): this assumes nothing truncates or rewrites the index file
    // while it is mapped — confirm against how indexes are produced.
    let mmap = unsafe { MmapOptions::new().map(&file) }
        .with_context(|| format!("Failed to memory-map index file: {:?}", index_file))?;

    let (header, header_size) = parse_header(&mmap)?;
    debug!("Index header: order={}, num_ngrams={}, hint_type={}", 
           header.order, header.num_ngrams, header.hint_type);

    let hints = parse_hints(&mmap, &header, header_size)?;

    // Convert every pattern up front so a bad pattern is reported before any
    // searching (or statistics output) happens.
    let order = header.order as usize;
    let ngram_candidates = patterns
        .iter()
        .map(|pattern| -> Result<Vec<u32>> {
            let bytes = parse_hex_pattern(pattern)?;
            extract_ngrams(&bytes, order)
        })
        .collect::<Result<Vec<_>>>()?;

    if show_stats {
        println!("# Index: {:?}", index_file);
        println!("# Order: {}", header.order);
        println!("# Hint type: {}", header.hint_type);
    }

    let mut all_candidates = HashSet::new();
    for (pattern_no, ngrams) in (1usize..).zip(ngram_candidates.iter()) {
        debug!("Processing pattern {}: {} N-grams", pattern_no, ngrams.len());

        let found = search_ngrams(&mmap, &header, &hints, ngrams, max_candidates, show_stats)?;
        all_candidates.extend(found);

        // Enough candidates collected; skip the remaining patterns.
        if all_candidates.len() >= max_candidates {
            break;
        }
    }

    Ok(all_candidates)
}

/// Decoded form of the fixed-size header at the start of a BigGrep index file.
///
/// Populated by `parse_header`, which reads the integer fields as big-endian
/// values in the order they are declared here (`compressed` is derived, not
/// read from the file).
#[derive(Debug, Clone)]
struct IndexHeader {
    /// First 8 bytes of the file; `parse_header` requires `b"BIGGREP1"`.
    magic: [u8; 8],
    /// Format version; `compressed` is derived from it.
    version: u32,
    /// N-gram order (bytes per N-gram); `extract_ngrams` supports only 3 and 4.
    order: u32,
    /// N-gram count as stored in the header (only logged in the visible code).
    num_ngrams: u64,
    /// File count as stored in the header (not read in the visible code).
    num_files: u32,
    /// Selects how many low bits `get_ngram_prefix` drops to form a hint key.
    hint_type: u32,
    /// Byte offset in the file where the hints table begins.
    hints_offset: u64,
    /// Byte offset where the hints table ends and the N-gram index begins.
    index_offset: u64,
    /// Byte offset where the N-gram index ends and the file-ID map begins.
    fileid_map_offset: u64,
    /// Set by `parse_header` when `version >= 2`.
    compressed: bool,
}

fn parse_header(mmap: &Mmap) -> Result<(IndexHeader, usize)> {
    if mmap.len() < 64 {
        anyhow::bail!("Index file too small: {} bytes", mmap.len());
    }
    
    let mut cursor = std::io::Cursor::new(&mmap[..64]);
    
    // Read magic (8 bytes)
    let mut magic = [0u8; 8];
    cursor.read_exact(&mut magic)?;
    
    if &magic != b"BIGGREP1" {
        anyhow::bail!("Invalid magic number: {:?}", magic);
    }
    
    // Read header fields
    let version = cursor.read_u32::<BigEndian>()?;
    let order = cursor.read_u32::<BigEndian>()?;
    let num_ngrams = cursor.read_u64::<BigEndian>()?;
    let num_files = cursor.read_u32::<BigEndian>()?;
    let hint_type = cursor.read_u32::<BigEndian>()?;
    let hints_offset = cursor.read_u64::<BigEndian>()?;
    let index_offset = cursor.read_u64::<BigEndian>()?;
    let fileid_map_offset = cursor.read_u64::<BigEndian>()?;
    
    let header = IndexHeader {
        magic,
        version,
        order,
        num_ngrams,
        num_files,
        hint_type,
        hints_offset,
        index_offset,
        fileid_map_offset,
        compressed: version >= 2, // Assume compression for version >= 2
    };
    
    Ok((header, 64))
}

/// Parse the hints table used for fast N-gram lookup.
///
/// The table occupies the bytes from `header.hints_offset` up to (not
/// including) `header.index_offset`. It is read as consecutive 12-byte
/// records: a big-endian `u32` key followed by a big-endian `u64` offset.
/// Trailing bytes that do not form a whole record are ignored.
///
/// `header_size` is accepted for interface compatibility and is not used.
///
/// # Errors
///
/// Fails if either offset lies at or beyond the end of the file, or if the
/// hints table would start after the index (which previously caused a slice
/// panic instead of an error).
fn parse_hints(mmap: &Mmap, header: &IndexHeader, header_size: usize) -> Result<Vec<(u32, u64)>> {
    const HINT_RECORD_BYTES: usize = 12; // 4-byte key + 8-byte offset

    // Validate in u64 so the offsets are never truncated by a cast before
    // they have been checked against the file length.
    let file_len = mmap.len() as u64;
    if header.hints_offset >= file_len
        || header.index_offset >= file_len
        || header.hints_offset > header.index_offset
    {
        anyhow::bail!("Invalid hint or index offsets");
    }

    // Both offsets are below `mmap.len()`, so they fit in `usize`.
    let hints_start = header.hints_offset as usize;
    let index_start = header.index_offset as usize;

    let hints_data = &mmap[hints_start..index_start];
    let num_hints = hints_data.len() / HINT_RECORD_BYTES;

    let mut hints = Vec::with_capacity(num_hints);
    let mut cursor = std::io::Cursor::new(hints_data);

    for _ in 0..num_hints {
        let key = cursor.read_u32::<BigEndian>()?;
        let offset = cursor.read_u64::<BigEndian>()?;
        hints.push((key, offset));
    }

    Ok(hints)
}

/// Decode a hex-encoded pattern into raw bytes.
///
/// Spaces and every occurrence of `0x` are removed first; what remains must be
/// an even number of hexadecimal digits (either case). An empty pattern yields
/// an empty vector.
///
/// # Errors
///
/// Fails if the cleaned pattern has odd length or contains anything other
/// than hex digits. Signs such as `+` are rejected, and non-ASCII input is
/// reported as an error rather than panicking on a character boundary.
fn parse_hex_pattern(pattern: &str) -> Result<Vec<u8>> {
    let clean_pattern = pattern.replace(" ", "").replace("0x", "");

    if clean_pattern.len() % 2 != 0 {
        anyhow::bail!("Hex pattern must have even length: {}", pattern);
    }

    // Work on bytes rather than string slices: slicing a `str` at arbitrary
    // byte indices panics when it splits a multi-byte character, and
    // `from_str_radix` would accept a leading sign.
    clean_pattern
        .as_bytes()
        .chunks_exact(2)
        .map(|pair| {
            let high = char::from(pair[0]).to_digit(16);
            let low = char::from(pair[1]).to_digit(16);
            match (high, low) {
                // Both nibbles are below 16, so the combined value fits in a u8.
                (Some(high), Some(low)) => Ok((high << 4 | low) as u8),
                _ => Err(anyhow::anyhow!(
                    "Invalid hex byte: {}",
                    String::from_utf8_lossy(pair)
                )),
            }
        })
        .collect()
}

/// Collect the distinct N-grams of `data` as sorted `u32` values.
///
/// Every window of `ngram_order` consecutive bytes is packed little-endian
/// (the first byte of the window ends up in the lowest 8 bits). Only orders 3
/// and 4 are supported. The result is sorted ascending with duplicates
/// removed.
///
/// # Errors
///
/// Fails if `data` is shorter than `ngram_order`, or if the order is neither 3
/// nor 4 (the length check is performed first).
fn extract_ngrams(data: &[u8], ngram_order: usize) -> Result<Vec<u32>> {
    if data.len() < ngram_order {
        anyhow::bail!("Data too short for {}-grams: {} bytes", ngram_order, data.len());
    }
    if !matches!(ngram_order, 3 | 4) {
        anyhow::bail!("Unsupported N-gram order: {}", ngram_order);
    }

    // Folding the window back to front leaves its first byte in the low bits.
    let mut ngrams: Vec<u32> = data
        .windows(ngram_order)
        .map(|window| {
            window
                .iter()
                .rev()
                .fold(0u32, |packed, &byte| (packed << 8) | u32::from(byte))
        })
        .collect();

    ngrams.sort();
    ngrams.dedup();

    Ok(ngrams)
}

/// Find the files that contain every N-gram in `ngrams` and return their paths.
///
/// For each N-gram the hints table supplies a starting offset (falling back to
/// `header.index_offset` when no hint applies), the posting list is read from
/// that window, and the running candidate set is intersected with it. The loop
/// stops as soon as the set becomes empty. After each step the set is cut down
/// to at most `max_candidates` entries. With `show_stats` set, two `#` summary
/// lines are printed. The surviving IDs are finally mapped to paths.
///
/// NOTE(review): the per-step cut keeps an arbitrary subset (hash-set
/// iteration order) before later intersections run, so it looks like matches
/// could be dropped when a set exceeds the limit — confirm this is intended.
///
/// # Errors
///
/// Propagates failures from reading the index window or resolving file IDs.
fn search_ngrams(
    mmap: &Mmap,
    header: &IndexHeader,
    hints: &[(u32, u64)],
    ngrams: &[u32],
    max_candidates: usize,
    show_stats: bool,
) -> Result<HashSet<String>> {
    // `None` until the first N-gram has been looked up.
    let mut surviving: Option<HashSet<u32>> = None;

    for &ngram in ngrams {
        debug!("Searching for N-gram: 0x{:08x}", ngram);

        let prefix = get_ngram_prefix(ngram, header.hint_type);
        let window_start = match find_hint_offset(hints, prefix) {
            Some(offset) => offset,
            None => header.index_offset,
        };

        let postings = search_ngram_in_window(mmap, header, window_start, ngram, max_candidates)?;

        let mut narrowed = match surviving.take() {
            Some(previous) => intersect_candidates(&previous, &postings),
            None => postings,
        };

        // Nothing can match once the running set is empty.
        if narrowed.is_empty() {
            surviving = Some(narrowed);
            break;
        }

        // Limit candidates for performance.
        if narrowed.len() > max_candidates {
            narrowed = narrowed.iter().take(max_candidates).cloned().collect();
        }
        surviving = Some(narrowed);
    }

    let file_ids = surviving.unwrap_or_default();

    if show_stats {
        println!("# N-grams: {}", ngrams.len());
        println!("# Candidates: {}", file_ids.len());
    }

    resolve_file_ids_to_paths(mmap, header, &file_ids)
}

/// Derive the hint-table key for `ngram`.
///
/// The hint type decides how many low bits are discarded: type 1 drops 4 bits
/// (keeping the top 28), type 2 drops none (full 32 bits), and type 0 as well
/// as any unrecognised type drops 8 bits (keeping the top 24).
fn get_ngram_prefix(ngram: u32, hint_type: u32) -> u32 {
    let dropped_bits = match hint_type {
        1 => 4,
        2 => 0,
        _ => 8,
    };
    ngram >> dropped_bits
}

/// Look up the index offset to start searching from for `prefix`.
///
/// `hints` must be sorted by key in ascending order. The result is the offset
/// of the last hint whose key is less than or equal to `prefix`, or `None`
/// when every key is greater than `prefix` (including when `hints` is empty).
fn find_hint_offset(hints: &[(u32, u64)], prefix: u32) -> Option<u64> {
    // Index of the first hint whose key exceeds `prefix`; the entry just
    // before it, if there is one, is the hint we want.
    let first_greater = hints.partition_point(|&(key, _)| key <= prefix);
    first_greater.checked_sub(1).map(|idx| hints[idx].1)
}

/// Scan a short run of index entries for `target_ngram` and return its file IDs.
///
/// Starting at `start_offset`, up to 16 entries are examined. Each entry is
/// read as: a VarByte-encoded payload size, a 4-byte little-endian N-gram, and
/// then that many payload bytes. The scan stops at the first entry whose
/// N-gram equals the target (its payload is decoded and returned), at the
/// first entry whose N-gram is greater than the target, or when an entry does
/// not fit inside the index region. The index region ends at
/// `header.fileid_map_offset`, clamped here to the file length.
///
/// At most `max_candidates` IDs are returned; when the posting list is longer
/// an arbitrary subset is kept.
///
/// # Errors
///
/// Propagates failures from reading the entry size or decoding the payload.
fn search_ngram_in_window(
    mmap: &Mmap,
    header: &IndexHeader,
    start_offset: u64,
    target_ngram: u32,
    max_candidates: usize,
) -> Result<HashSet<u32>> {
    // Search within a reasonable window (16 N-grams typical).
    const MAX_WINDOW_ENTRIES: usize = 16;
    const NGRAM_BYTES: u64 = 4;

    // All offset arithmetic is done in u64 and compared against the file
    // length in u64, so nothing is cast to usize before it is known to be in
    // bounds. Clamping the region end to the file keeps slices valid even if
    // the header's file-ID map offset is bogus.
    let file_len = mmap.len() as u64;
    let index_end = header.fileid_map_offset.min(file_len);

    let mut candidates = HashSet::new();
    let mut offset = start_offset;

    for _ in 0..MAX_WINDOW_ENTRIES {
        if offset >= index_end {
            break;
        }

        // Read entry size (VarByte encoded).
        let (entry_size, size_bytes) = read_varbyte(mmap, offset)?;

        // `offset` is below the file length (at most `isize::MAX`), so adding
        // the few bytes of framing plus a u32 payload size cannot overflow.
        let ngram_start = offset + size_bytes as u64;
        let payload_start = ngram_start + NGRAM_BYTES;
        let payload_end = payload_start + u64::from(entry_size);

        // The whole entry (N-gram and payload) must lie inside the index.
        if payload_end > index_end {
            break;
        }

        let at = ngram_start as usize;
        let entry_ngram = u32::from_le_bytes([mmap[at], mmap[at + 1], mmap[at + 2], mmap[at + 3]]);

        if entry_ngram == target_ngram {
            // Found the target N-gram, decode its file IDs.
            let payload = &mmap[payload_start as usize..payload_end as usize];
            candidates.extend(decode_file_ids(payload)?);
            break;
        }
        if entry_ngram > target_ngram {
            // Entries are expected in ascending order; the target is absent.
            break;
        }

        offset = payload_end;
    }

    // Limit results.
    if candidates.len() > max_candidates {
        candidates = candidates.iter().take(max_candidates).cloned().collect();
    }

    Ok(candidates)
}

/// Read a VarByte-encoded `u32` from the mapped file starting at `offset`.
///
/// Each byte contributes its low 7 bits, least-significant group first; a set
/// high bit means another byte follows. Returns the decoded value and the
/// number of bytes consumed.
///
/// # Errors
///
/// Fails if the encoding runs past the end of the file, or if the value does
/// not fit in 32 bits (too many continuation bytes, or payload bits in the
/// fifth byte that would be shifted out).
fn read_varbyte(mmap: &Mmap, offset: u64) -> Result<(u32, usize)> {
    // Compare positions in u64: `offset` is a u64 while `mmap.len()` is a
    // usize, and casting the offset down first could truncate it.
    let file_len = mmap.len() as u64;

    let mut value = 0u32;
    let mut shift = 0u32;
    let mut bytes_read = 0usize;

    loop {
        let position = match offset.checked_add(bytes_read as u64) {
            Some(pos) if pos < file_len => pos as usize,
            _ => anyhow::bail!("Unexpected end of file while reading VarByte"),
        };
        let byte = mmap[position];
        bytes_read += 1;

        // `shift` is at most 28 here, so the shifts themselves cannot
        // overflow; the round trip detects payload bits that would be lost.
        let bits = u32::from(byte & 0x7F);
        if (bits << shift) >> shift != bits {
            anyhow::bail!("VarByte value too large");
        }
        value |= bits << shift;

        if (byte & 0x80) == 0 {
            return Ok((value, bytes_read));
        }

        shift += 7;
        if shift >= 32 {
            anyhow::bail!("VarByte value too large");
        }
    }
}

/// Read a VarByte-encoded `u32` from `data` starting at `offset`.
///
/// Each byte contributes its low 7 bits, least-significant group first; a set
/// high bit means another byte follows. Returns the decoded value and the
/// number of bytes consumed.
///
/// # Errors
///
/// Fails if the encoding runs past the end of `data`, or if the value does not
/// fit in 32 bits (too many continuation bytes, or payload bits in the fifth
/// byte that would be shifted out).
fn read_varbyte_payload(data: &[u8], offset: usize) -> Result<(u32, usize)> {
    let mut value = 0u32;
    let mut shift = 0u32;
    let mut bytes_read = 0usize;

    loop {
        let byte = match offset.checked_add(bytes_read).and_then(|pos| data.get(pos)) {
            Some(&byte) => byte,
            None => anyhow::bail!("Unexpected end of data while reading VarByte"),
        };
        bytes_read += 1;

        // `shift` is at most 28 here, so the shifts themselves cannot
        // overflow; the round trip detects payload bits that would be lost.
        let bits = u32::from(byte & 0x7F);
        if (bits << shift) >> shift != bits {
            anyhow::bail!("VarByte value too large");
        }
        value |= bits << shift;

        if (byte & 0x80) == 0 {
            return Ok((value, bytes_read));
        }

        shift += 7;
        if shift >= 32 {
            anyhow::bail!("VarByte value too large");
        }
    }
}

/// Decode file IDs from compressed payload
fn decode_file_ids(payload: &[u8]) -> Result<Vec<u32>> {
    if payload.is_empty() {
        return Ok(Vec::new());
    }
    
    let mut cursor = std::io::Cursor::new(payload);
    let mut file_ids = Vec::new();
    
    // Check encoding type (first bit of first byte)
    let first_byte = payload[0];
    let is_pfor = (first_byte & 0x80) != 0;
    
    if is_pfor {
        // PFOR encoded
        let blocksize = ((first_byte & 0x7F) as usize) * 8;
        let mut offset = 1;
        
        while offset < payload.len() {
            let block_end = std::cmp::min(offset + blocksize, payload.len());
            let block = &payload[offset..block_end];
            
            let decoded_block = decode_pfor_block(block)?;
            file_ids.extend(decoded_block);
            
            offset = block_end;
        }
    } else {
        // VarByte encoded
        let mut last_id = 0u32;
        let mut pos = 0;
        
        while pos < payload.len() {
            let (delta, bytes_read) = read_varbyte_payload(payload, pos)?;
            last_id += delta;
            file_ids.push(last_id);
            pos += bytes_read;
        }
    }
    
    Ok(file_ids)
}

/// Decode PFOR block
fn decode_pfor_block(block: &[u8]) -> Result<Vec<u32>> {
    if block.is_empty() {
        return Ok(Vec::new());
    }
    
    // Simplified PFOR decode - in practice this would be more complex
    // For now, treat as VarByte encoded deltas
    let mut values = Vec::new();
    let mut last_value = 0u32;
    let mut pos = 0;
    
    while pos < block.len() {
        let (delta, bytes_read) = read_varbyte_payload(block, pos)?;
        last_value += delta;
        values.push(last_value);
        pos += bytes_read;
    }
    
    Ok(values)
}

/// Return the IDs present in both sets.
///
/// The smaller set is iterated and each of its IDs is probed in the other
/// set, so the work is proportional to the smaller input.
fn intersect_candidates(set1: &HashSet<u32>, set2: &HashSet<u32>) -> HashSet<u32> {
    let (probe, lookup) = if set1.len() < set2.len() {
        (set1, set2)
    } else {
        (set2, set1)
    };

    probe
        .iter()
        .copied()
        .filter(|id| lookup.contains(id))
        .collect()
}

/// Translate file IDs into file paths using the file-ID map at the end of the
/// index.
///
/// The map is everything from `header.fileid_map_offset` to the end of the
/// file, read as (lossily decoded) UTF-8 text. A line's zero-based position in
/// that text — blank lines included — is taken as its file ID. Lines with
/// fewer than two whitespace-separated fields are ignored; for a line whose ID
/// is in `file_ids`, the second field is returned as the path.
///
/// Returns an empty set when the map offset lies at or beyond the end of the
/// file.
///
/// NOTE(review): `header.compressed` is not consulted — the original code had
/// a placeholder for zlib decompression that passed the bytes through
/// unchanged, so compressed maps are not actually decompressed.
/// NOTE(review): lines look like `id filepath [metadata...]`, yet the ID is
/// taken from the line position rather than the first field, and paths that
/// contain whitespace would be cut short — confirm against the index writer.
fn resolve_file_ids_to_paths(
    mmap: &Mmap,
    header: &IndexHeader,
    file_ids: &HashSet<u32>,
) -> Result<HashSet<String>> {
    let map_start = header.fileid_map_offset as usize;
    if map_start >= mmap.len() {
        return Ok(HashSet::new());
    }

    let map_text = String::from_utf8_lossy(&mmap[map_start..]);

    let file_paths = map_text
        .lines()
        .enumerate()
        .filter_map(|(line_num, line)| {
            // Need at least the leading field and the path; blank or short
            // lines produce nothing but still occupy a line number.
            let mut fields = line.split_whitespace();
            let _leading_field = fields.next()?;
            let filepath = fields.next()?;

            // ID is implicit in line position.
            let file_id = line_num as u32;
            file_ids.contains(&file_id).then(|| filepath.to_string())
        })
        .collect();

    Ok(file_paths)
}