//! Index module with Elias-Fano trie implementation for compressed N-gram storage
//! 
//! Implements compressed N-gram index construction and search using Elias-Fano encoding
//! as specified in the design document.

use crate::error::{BigGrepError, BigGrepResult};
use crate::ngram::{NgramCount, Token, VocabularyMapper};
use std::collections::{BTreeMap, HashMap};
use std::io::{Read, Write, Seek, SeekFrom};
use byteorder::{ReadBytesExt, WriteBytesExt, BigEndian, LittleEndian};

/// Fixed-layout header stored at the start of a serialized index.
#[derive(Debug, Clone)]
pub struct IndexHeader {
    /// File signature; `validate` accepts only `b"BIGGREP1"`.
    pub magic: [u8; 8],
    /// Format version; `validate` accepts only `1`.
    pub version: u32,
    /// N-gram order.
    pub order: u32,
    /// N-gram count supplied by whoever built the header.
    pub num_ngrams: u64,
    /// Token count supplied by whoever built the header.
    pub num_tokens: u32,
    /// Flag bits; `new` zeroes them and nothing in this file interprets them.
    pub flags: u32,
}

impl IndexHeader {
    /// Signature written by `new` and required by `validate`.
    const MAGIC: [u8; 8] = *b"BIGGREP1";
    /// The only format version `validate` accepts.
    const VERSION: u32 = 1;

    /// Builds a version-1 header with the standard signature and no flags set.
    pub fn new(order: u32, num_ngrams: u64, num_tokens: u32) -> Self {
        let magic = Self::MAGIC;
        let version = Self::VERSION;
        Self {
            magic,
            version,
            order,
            num_ngrams,
            num_tokens,
            flags: 0,
        }
    }

    /// Checks the signature first, then the version.
    ///
    /// # Errors
    /// Returns `BigGrepError::Index` when either field has an unexpected value.
    pub fn validate(&self) -> BigGrepResult<()> {
        let problem = if self.magic != Self::MAGIC {
            Some("Invalid magic number")
        } else if self.version != Self::VERSION {
            Some("Unsupported version")
        } else {
            None
        };

        match problem {
            Some(message) => Err(BigGrepError::Index(message.to_string())),
            None => Ok(()),
        }
    }
}

/// Elias-Fano encoder for integers
#[derive(Debug, Clone)]
pub struct EliasFanoEncoder {
    lower_bits: Vec<u64>,
    higher_bits: Vec<u64>,
    lower_bit_count: usize,
    max_value: u64,
}

impl EliasFanoEncoder {
    pub fn new(values: &[u64]) -> Self {
        if values.is_empty() {
            return Self {
                lower_bits: Vec::new(),
                higher_bits: Vec::new(),
                lower_bit_count: 0,
                max_value: 0,
            };
        }
        
        let max_value = values.iter().max().unwrap().clone();
        let n = values.len() as u64;
        
        // Calculate lower bit count
        let lower_bit_count = ((max_value as f64).log2().floor() as usize).max(1);
        let upper_bound = (max_value >> lower_bit_count) + n + 1;
        let upper_bit_count = ((upper_bound as f64).log2().ceil() as usize).max(1);
        
        // Encode values
        let mut lower_bits = Vec::with_capacity(n as usize * lower_bit_count);
        let mut higher_bits = Vec::with_capacity(n as usize * upper_bit_count);
        
        for &value in values {
            // Lower bits
            let lower = (value & ((1 << lower_bit_count) - 1)) as u64;
            lower_bits.extend_from_slice(&lower.to_be_bytes()[8 - lower_bit_count/8..]);
            
            // Higher bits (prefix sum)
            let higher = value >> lower_bit_count;
            higher_bits.push(higher);
        }
        
        Self {
            lower_bits,
            higher_bits,
            lower_bit_count,
            max_value,
        }
    }
    
    pub fn encode(&self) -> Vec<u8> {
        let mut result = Vec::new();
        
        // Write metadata
        result.write_u64::<BigEndian>(self.max_value).unwrap();
        result.write_u64::<BigEndian>(self.lower_bit_count as u64).unwrap();
        result.write_u64::<BigEndian>(self.higher_bits.len() as u64).unwrap();
        
        // Write encoded data
        result.extend_from_slice(&self.lower_bits);
        result.extend_from_slice(&serialize_ranks(&self.higher_bits));
        
        result
    }
    
    pub fn decode(data: &[u8]) -> BigGrepResult<Self> {
        if data.len() < 24 {
            return Err(BigGrepError::Deserialization("Invalid EF data".to_string()));
        }
        
        let mut cursor = std::io::Cursor::new(data);
        let max_value = cursor.read_u64::<BigEndian>()?;
        let lower_bit_count = cursor.read_u64::<BigEndian>()? as usize;
        let higher_count = cursor.read_u64::<BigEndian>()? as usize;
        
        // Read lower bits
        let lower_bytes = (lower_bit_count * higher_count + 7) / 8;
        let mut lower_bits = vec![0u8; lower_bytes];
        cursor.read_exact(&mut lower_bits)?;
        
        // Read higher bits (with rank encoding)
        let higher_bits_data_len = data.len() - cursor.position() as usize;
        let mut higher_bits_data = vec![0u8; higher_bits_data_len];
        cursor.read_exact(&mut higher_bits_data)?;
        let higher_bits = deserialize_ranks(&higher_bits_data, higher_count)?;
        
        Ok(Self {
            lower_bits,
            higher_bits,
            lower_bit_count,
            max_value,
        })
    }
}

/// Flattens `values` into bytes: each `u64` becomes its 8-byte big-endian
/// representation, in input order.
fn serialize_ranks(values: &[u64]) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(values.len() * 8);
    for value in values {
        bytes.extend_from_slice(&value.to_be_bytes());
    }
    bytes
}

/// Reads `count` big-endian `u64` values from the start of `data`.
///
/// Bytes beyond the first `count * 8` are ignored.
///
/// # Errors
/// Returns `BigGrepError::Deserialization` when `data` holds fewer than
/// `count * 8` bytes. The size is computed with overflow checking and verified
/// before any allocation, so an absurd `count` from corrupt input is rejected
/// instead of wrapping around or reserving a huge buffer.
fn deserialize_ranks(data: &[u8], count: usize) -> BigGrepResult<Vec<u64>> {
    let needed = count
        .checked_mul(8)
        .filter(|&needed| needed <= data.len())
        .ok_or_else(|| BigGrepError::Deserialization("Invalid rank data".to_string()))?;

    let values = data[..needed]
        .chunks_exact(8)
        .map(|chunk| {
            let mut raw = [0u8; 8];
            raw.copy_from_slice(chunk);
            u64::from_be_bytes(raw)
        })
        .collect();

    Ok(values)
}

/// One node of the trie stored in `EFTrie::nodes`.
///
/// NOTE(review): `EFTrie::serialize` and `EFTrie::deserialize` hand
/// `Vec<EFTrieNode>` to bincode, which needs serde `Serialize`/`Deserialize`
/// implementations for this type; none are derived here — confirm they exist.
#[derive(Debug, Clone)]
struct EFTrieNode {
    /// Vocabulary ID of the token this node stands for (0 for the root).
    token_id: u32,
    /// Count accumulated by `EFTrie::insert` for N-grams ending at this node.
    count: u64,
    /// Positions of the child nodes inside the owning node vector.
    children: Vec<usize>,
    /// Set once an N-gram ending at this node has been inserted.
    is_leaf: bool,
}

impl EFTrieNode {
    /// Creates a childless, non-leaf node with a zero count.
    fn new(token_id: u32) -> Self {
        let children = Vec::new();
        Self {
            is_leaf: false,
            count: 0,
            children,
            token_id,
        }
    }
}

/// Elias-Fano Trie implementation
#[derive(Debug)]
pub struct EFTrie {
    nodes: Vec<EFTrieNode>,
    order: usize,
    vocab: VocabularyMapper,
}

impl EFTrie {
    pub fn new(order: usize, vocab: VocabularyMapper) -> Self {
        Self {
            nodes: vec![EFTrieNode::new(0)], // Root node
            order,
            vocab,
        }
    }
    
    /// Insert an N-gram into the trie
    pub fn insert(&mut self, ngram: &[Token], count: u64) -> BigGrepResult<()> {
        if ngram.len() != self.order {
            return Err(BigGrepError::Index(format!(
                "Expected N-gram of order {}, got {}",
                self.order, ngram.len()
            )));
        }
        
        let mut current = 0usize; // Start at root
        
        for token in ngram {
            let token_id = self.vocab.get_id(&token.text)
                .ok_or_else(|| BigGrepError::Index(format!("Unknown token: {}", token.text)))?;
            
            // Find or create child
            let child_idx = if let Some(&idx) = self.nodes[current].children.iter()
                .find(|&&idx| self.nodes[idx].token_id == token_id) {
                idx
            } else {
                let new_node = EFTrieNode::new(token_id);
                let new_idx = self.nodes.len();
                self.nodes.push(new_node);
                self.nodes[current].children.push(new_idx);
                new_idx
            };
            
            current = child_idx;
        }
        
        // Mark leaf and update count
        self.nodes[current].is_leaf = true;
        self.nodes[current].count += count;
        
        Ok(())
    }
    
    /// Search for prefix matches
    pub fn search_prefix(&self, prefix: &[String]) -> Vec<&NgramCount> {
        let mut results = Vec::new();
        let prefix_len = prefix.len();
        
        if prefix_len == 0 {
            return self.collect_all_ngrams();
        }
        
        if prefix_len > self.order {
            return results;
        }
        
        // Convert prefix to token IDs
        let prefix_ids: Vec<u32> = prefix.iter()
            .filter_map(|token| self.vocab.get_id(token))
            .collect();
        
        if prefix_ids.len() != prefix_len {
            return results; // Unknown token in prefix
        }
        
        // Traverse to prefix location
        let mut current = 0usize;
        let mut found = true;
        
        for &token_id in &prefix_ids {
            if let Some(&child_idx) = self.nodes[current].children.iter()
                .find(|&&idx| self.nodes[idx].token_id == token_id) {
                current = child_idx;
            } else {
                found = false;
                break;
            }
        }
        
        if !found {
            return results;
        }
        
        // Collect all ngrams from this prefix
        self.collect_from_node(current, &mut results, &prefix_ids);
        
        results
    }
    
    fn collect_from_node(&self, node_idx: usize, results: &mut Vec<&NgramCount>, prefix: &[u32]) {
        let node = &self.nodes[node_idx];
        
        if node.is_leaf {
            let mut tokens = Vec::new();
            for &token_id in prefix {
                if let Some(token_str) = self.vocab.get_token(token_id) {
                    tokens.push(Token::new(token_str.to_string(), 0, token_str.len()));
                }
            }
            results.push(Box::leak(Box::new(NgramCount::new(tokens, node.count))));
        }
        
        for &child_idx in &node.children {
            let mut new_prefix = prefix.to_vec();
            new_prefix.push(self.nodes[child_idx].token_id);
            self.collect_from_node(child_idx, results, &new_prefix);
        }
    }
    
    fn collect_all_ngrams(&self) -> Vec<&NgramCount> {
        let mut results = Vec::new();
        let mut prefix = Vec::new();
        self.collect_from_node(0, &mut results, &prefix);
        results
    }
    
    /// Get statistics about the trie
    pub fn stats(&self) -> TrieStats {
        TrieStats {
            node_count: self.nodes.len(),
            order: self.order,
            vocab_size: self.vocab.size(),
            total_ngrams: self.nodes.iter().filter(|n| n.is_leaf).count(),
        }
    }
    
    /// Serialize the trie to bytes
    pub fn serialize(&self) -> BigGrepResult<Vec<u8>> {
        let mut data = Vec::new();
        
        // Serialize header
        let num_ngrams = self.nodes.iter().filter(|n| n.is_leaf).count() as u64;
        let header = IndexHeader::new(self.order as u32, num_ngrams, self.vocab.size() as u32);
        data.extend_from_slice(&serialize_header(&header)?);
        
        // Serialize vocabulary
        let vocab_data = bincode::serialize(&self.vocab)
            .map_err(|e| BigGrepError::Serialization(e.to_string()))?;
        data.write_u64::<BigEndian>(vocab_data.len() as u64)?;
        data.extend_from_slice(&vocab_data);
        
        // Serialize nodes
        let node_data = bincode::serialize(&self.nodes)
            .map_err(|e| BigGrepError::Serialization(e.to_string()))?;
        data.write_u64::<BigEndian>(node_data.len() as u64)?;
        data.extend_from_slice(&node_data);
        
        Ok(data)
    }
    
    /// Deserialize trie from bytes
    pub fn deserialize(data: &[u8]) -> BigGrepResult<Self> {
        let mut cursor = std::io::Cursor::new(data);
        
        // Read header
        let header = deserialize_header(&mut cursor)?;
        header.validate()?;
        
        // Read vocabulary
        let vocab_len = cursor.read_u64::<BigEndian>()?;
        let mut vocab_data = vec![0u8; vocab_len as usize];
        cursor.read_exact(&mut vocab_data)?;
        let vocab = bincode::deserialize(&vocab_data)
            .map_err(|e| BigGrepError::Deserialization(e.to_string()))?;
        
        // Read nodes
        let node_len = cursor.read_u64::<BigEndian>()?;
        let mut node_data = vec![0u8; node_len as usize];
        cursor.read_exact(&mut node_data)?;
        let nodes = bincode::deserialize(&node_data)
            .map_err(|e| BigGrepError::Deserialization(e.to_string()))?;
        
        Ok(Self {
            nodes,
            order: header.order as usize,
            vocab,
        })
    }
}

/// Trie statistics
#[derive(Debug, Clone)]
pub struct TrieStats {
    pub node_count: usize,
    pub order: usize,
    pub vocab_size: usize,
    pub total_ngrams: usize,
}

/// Index builder for constructing EF-tries from sorted N-gram counts
pub struct IndexBuilder {
    order: usize,
    vocab: VocabularyMapper,
}

impl IndexBuilder {
    pub fn new(order: usize) -> Self {
        Self {
            order,
            vocab: VocabularyMapper::new(),
        }
    }
    
    /// Build EF-trie from sorted N-gram counts
    pub fn build_ef_trie(&mut self, ngram_counts: &[NgramCount]) -> BigGrepResult<EFTrie> {
        if ngram_counts.is_empty() {
            return Err(BigGrepError::Index("Empty N-gram counts".to_string()));
        }
        
        // Build vocabulary from all tokens
        for ngram_count in ngram_counts {
            let token_texts: Vec<String> = ngram_count.tokens.iter()
                .map(|t| t.text.clone())
                .collect();
            self.vocab.add_tokens(&token_texts);
        }
        
        // Create trie and insert all N-grams
        let mut trie = EFTrie::new(self.order, self.vocab.clone());
        
        for ngram_count in ngram_counts {
            trie.insert(&ngram_count.tokens, ngram_count.count)?;
        }
        
        Ok(trie)
    }
    
    /// Get vocabulary
    pub fn vocab(&self) -> &VocabularyMapper {
        &self.vocab
    }
}

/// Index reader for accessing serialized indexes
pub struct IndexReader {
    trie: EFTrie,
    header: IndexHeader,
}

impl IndexReader {
    pub fn new(trie: EFTrie, header: IndexHeader) -> Self {
        Self { trie, header }
    }
    
    /// Load index from file
    pub fn load(path: &std::path::Path) -> BigGrepResult<Self> {
        let data = std::fs::read(path)
            .map_err(|e| BigGrepError::Io(e))?;
        Self::from_bytes(&data)
    }
    
    /// Load index from bytes
    pub fn from_bytes(data: &[u8]) -> BigGrepResult<Self> {
        let trie = EFTrie::deserialize(data)?;
        let header = IndexHeader::new(trie.order as u32, 0, trie.vocab.size() as u32);
        
        Ok(Self { trie, header })
    }
    
    /// Search for N-grams with given prefix
    pub fn search_prefix(&self, prefix: &[String]) -> Vec<NgramCount> {
        self.trie.search_prefix(prefix).into_iter()
            .map(|ngram| (*ngram).clone())
            .collect()
    }
    
    /// Get index statistics
    pub fn stats(&self) -> TrieStats {
        self.trie.stats()
    }
    
    /// Get N-gram order
    pub fn order(&self) -> usize {
        self.trie.order
    }
}

/// A search hit: the N-gram's tokens, its count, and where it occurs.
#[derive(Debug, Clone)]
pub struct IndexEntry {
    /// Token texts of the N-gram.
    pub ngram: Vec<String>,
    /// Count associated with the N-gram.
    pub count: u64,
    /// Occurrences attached via `with_files`; empty after `new`.
    pub file_references: Vec<FileReference>,
}

impl IndexEntry {
    /// Creates an entry with no file references.
    pub fn new(ngram: Vec<String>, count: u64) -> Self {
        let file_references = Vec::new();
        Self {
            ngram,
            count,
            file_references,
        }
    }

    /// Returns the entry with its file references replaced by `files`.
    pub fn with_files(self, files: Vec<FileReference>) -> Self {
        Self {
            file_references: files,
            ..self
        }
    }
}

/// Location of one N-gram occurrence inside a file.
#[derive(Debug, Clone)]
pub struct FileReference {
    /// Path of the file containing the occurrence.
    pub file_path: String,
    /// Byte offset of the occurrence.
    pub byte_offset: u64,
    /// Optional context text; `None` until `with_context` is called.
    pub context: Option<String>,
}

impl FileReference {
    /// Creates a reference without context.
    pub fn new(file_path: String, byte_offset: u64) -> Self {
        let context = None;
        Self {
            file_path,
            byte_offset,
            context,
        }
    }

    /// Returns the reference with its context set to `context`.
    pub fn with_context(self, context: String) -> Self {
        Self {
            context: Some(context),
            ..self
        }
    }
}

/// Serialize index header
fn serialize_header(header: &IndexHeader) -> BigGrepResult<Vec<u8>> {
    let mut data = Vec::new();
    data.extend_from_slice(&header.magic);
    data.write_u32::<BigEndian>(header.version)?;
    data.write_u32::<BigEndian>(header.order)?;
    data.write_u64::<BigEndian>(header.num_ngrams)?;
    data.write_u32::<BigEndian>(header.num_tokens)?;
    data.write_u32::<BigEndian>(header.flags)?;
    Ok(data)
}

/// Reads a header from `cursor`, consuming the 8-byte magic and then the
/// big-endian `version`, `order`, `num_ngrams`, `num_tokens` and `flags` fields.
///
/// The fields are returned as read; call `IndexHeader::validate` to check them.
/// Fails with the converted I/O error when the input ends early.
fn deserialize_header(cursor: &mut std::io::Cursor<&[u8]>) -> BigGrepResult<IndexHeader> {
    let mut signature = [0u8; 8];
    cursor.read_exact(&mut signature)?;

    // Field initializers run in the order written, which is the on-disk order.
    Ok(IndexHeader {
        magic: signature,
        version: cursor.read_u32::<BigEndian>()?,
        order: cursor.read_u32::<BigEndian>()?,
        num_ngrams: cursor.read_u64::<BigEndian>()?,
        num_tokens: cursor.read_u32::<BigEndian>()?,
        flags: cursor.read_u32::<BigEndian>()?,
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a token whose span starts at 0 and covers the whole text.
    fn token(text: &str) -> Token {
        Token::new(text.to_string(), 0, text.len())
    }

    /// An encoded sequence must decode again and keep its maximum value.
    #[test]
    fn test_elias_fano_encoding() {
        let values: Vec<u64> = vec![1, 3, 4, 7, 8, 10, 15];
        let encoder = EliasFanoEncoder::new(&values);
        let encoded = encoder.encode();

        // Should be able to decode
        let decoder = EliasFanoEncoder::decode(&encoded).unwrap();
        assert_eq!(decoder.max_value, 15);
    }

    /// Buffers shorter than the 24-byte metadata block are rejected.
    #[test]
    fn test_elias_fano_rejects_short_input() {
        assert!(EliasFanoEncoder::decode(&[0u8; 23]).is_err());
    }

    /// Two N-grams sharing a first token are both found by that prefix.
    #[test]
    fn test_ef_trie_insertion() {
        let mut vocab = VocabularyMapper::new();
        vocab.add_tokens(&["hello".to_string(), "world".to_string()]);
        vocab.add_tokens(&["hello".to_string(), "universe".to_string()]);

        let mut trie = EFTrie::new(2, vocab);

        // Insert "hello world"
        trie.insert(&[token("hello"), token("world")], 1).unwrap();

        // Insert "hello universe"
        trie.insert(&[token("hello"), token("universe")], 2).unwrap();

        // Search prefix "hello"
        let results = trie.search_prefix(&["hello".to_string()]);
        assert_eq!(results.len(), 2);
        let total: u64 = results.iter().map(|ngram| ngram.count).sum();
        assert_eq!(total, 3);

        // A token outside the vocabulary matches nothing.
        assert!(trie.search_prefix(&["missing".to_string()]).is_empty());
    }

    /// N-grams of the wrong length or with unknown tokens are rejected.
    #[test]
    fn test_ef_trie_rejects_invalid_ngrams() {
        let mut vocab = VocabularyMapper::new();
        vocab.add_tokens(&["hello".to_string(), "world".to_string()]);
        let mut trie = EFTrie::new(2, vocab);

        assert!(trie.insert(&[token("hello")], 1).is_err());
        assert!(trie.insert(&[token("hello"), token("nope")], 1).is_err());
    }

    /// The builder collects the vocabulary and inserts every N-gram.
    #[test]
    fn test_index_builder() {
        let mut builder = IndexBuilder::new(3);

        let counts = vec![
            NgramCount::new(vec![token("the"), token("quick"), token("brown")], 5),
            NgramCount::new(vec![token("quick"), token("brown"), token("fox")], 3),
        ];

        let trie = builder.build_ef_trie(&counts).unwrap();
        assert_eq!(trie.order, 3);
        assert_eq!(trie.vocab.size(), 4); // the, quick, brown, fox
        assert_eq!(trie.stats().total_ngrams, 2);

        let hits = trie.search_prefix(&["the".to_string()]);
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].count, 5);
    }

    /// Building from no input is an error.
    #[test]
    fn test_index_builder_rejects_empty_input() {
        let mut builder = IndexBuilder::new(3);
        assert!(builder.build_ef_trie(&[]).is_err());
    }
}
