//! N-gram processing module for tokenization and counting
//! 
//! Supports N-gram orders 2 through 4 (the requirements call for 3-grams and 4-grams).
//! Implements streaming tokenization and N-gram counting with compression.

use crate::error::{BigGrepError, BigGrepResult};
use std::collections::{HashMap, BTreeMap};
use std::hash::{Hash, Hasher};
use std::cmp::Ordering;

/// A single token produced during tokenization.
///
/// The fields are stored exactly as handed to [`Token::new`]; no validation
/// is performed. `TokenStream` in this module fills `position` with the byte
/// offset of the token in the source text and `length` with the byte length
/// of the original (pre-normalization) span, while `NgramSorter::sort_counts`
/// creates tokens with both set to `0`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Token {
    /// Token text (possibly case-normalized by the producer).
    pub text: String,
    /// Offset of the token as reported by the producer.
    pub position: usize,
    /// Length of the token as reported by the producer.
    pub length: usize,
}

impl Token {
    /// Builds a token from its text, position and length.
    pub fn new(text: String, position: usize, length: usize) -> Self {
        Token {
            text,
            position,
            length,
        }
    }
}

/// An N-gram (ordered sequence of tokens) paired with its count.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NgramCount {
    /// The tokens making up the N-gram, in order.
    pub tokens: Vec<Token>,
    /// The count attached to this N-gram.
    pub count: u64,
}

impl NgramCount {
    /// Pairs a token sequence with a count.
    pub fn new(tokens: Vec<Token>, count: u64) -> Self {
        NgramCount { tokens, count }
    }

    /// Returns `true` when the entry carries no information: either it has
    /// no tokens or its count is zero.
    pub fn is_empty(&self) -> bool {
        match self.count {
            0 => true,
            _ => self.tokens.is_empty(),
        }
    }
}

/// Settings that control how text is split into tokens.
#[derive(Debug, Clone)]
pub struct TokenizerConfig {
    /// Lowercase every token's text when `true`.
    pub normalize_case: bool,
    /// NOTE(review): not read by any code visible in this module — confirm
    /// whether another module consumes it.
    pub ignore_punctuation: bool,
    /// Tokens whose (normalized) text is shorter than this many bytes are
    /// dropped by the tokenizer.
    pub min_token_length: usize,
    /// When `true`, bytes with the high bit set (non-ASCII UTF-8 bytes) are
    /// treated as token characters in addition to ASCII alphanumerics.
    pub unicode_support: bool,
    /// NOTE(review): presumably the intended chunk size in bytes; it is not
    /// read by any code visible in this module.
    pub chunk_size: usize,
}

impl Default for TokenizerConfig {
    /// Case-normalizing, ASCII-only configuration with a minimum token length
    /// of 2 and a 1 MiB chunk size.
    fn default() -> Self {
        // 1MB chunks
        const DEFAULT_CHUNK_SIZE: usize = 1024 * 1024;

        TokenizerConfig {
            chunk_size: DEFAULT_CHUNK_SIZE,
            min_token_length: 2,
            unicode_support: false,
            ignore_punctuation: true,
            normalize_case: true,
        }
    }
}

/// Streaming token iterator over a borrowed string.
///
/// A token is a maximal run of ASCII alphanumeric bytes. When
/// `config.unicode_support` is set, every byte with the high bit set is also
/// treated as a token byte, so *all* non-ASCII characters (including
/// non-ASCII punctuation and whitespace) become part of tokens. ASCII
/// whitespace and every other byte act as separators.
///
/// Tokens are lowercased when `config.normalize_case` is set, and tokens
/// whose resulting text is shorter than `config.min_token_length` bytes are
/// skipped. `config.ignore_punctuation` and `config.chunk_size` are not
/// consulted here.
pub struct TokenStream<'a> {
    text: &'a str,
    config: TokenizerConfig,
    /// Byte offset at which the next scan starts.
    position: usize,
}

impl<'a> TokenStream<'a> {
    /// Creates a stream positioned at the start of `text`.
    pub fn new(text: &'a str, config: TokenizerConfig) -> Self {
        Self {
            text,
            config,
            position: 0,
        }
    }
}

impl<'a> Iterator for TokenStream<'a> {
    type Item = BigGrepResult<Token>;

    /// Returns the next token, or `None` once the text is exhausted.
    ///
    /// Every yielded item is `Ok`; this iterator never produces an error.
    /// `Token::position` is the byte offset of the token in the source text
    /// and `Token::length` the byte length of the original span (which can
    /// differ from `text.len()` after lowercasing).
    fn next(&mut self) -> Option<Self::Item> {
        let text: &'a str = self.text;
        let bytes = text.as_bytes();
        let len = bytes.len();
        let unicode = self.config.unicode_support;

        // Iterate instead of recursing: skipping a separator byte or a
        // too-short token used to cost one stack frame each, so a long run of
        // punctuation or one-letter words could overflow the stack.
        loop {
            // Skip leading ASCII whitespace.
            let mut cursor = self.position;
            while cursor < len && bytes[cursor].is_ascii_whitespace() {
                cursor += 1;
            }
            if cursor >= len {
                self.position = len;
                return None;
            }

            // Consume a run of token bytes.
            let token_start = cursor;
            while cursor < len
                && (bytes[cursor].is_ascii_alphanumeric()
                    || (unicode && (bytes[cursor] & 0x80) != 0))
            {
                cursor += 1;
            }

            if cursor == token_start {
                // Separator byte (punctuation, or a non-ASCII byte when
                // unicode support is off): step over it and rescan.
                self.position = cursor + 1;
                continue;
            }
            self.position = cursor;

            // The slice is always on char boundaries: a token starts on an
            // ASCII byte or a UTF-8 lead byte and ends just before an ASCII
            // byte, a lead byte, or the end of the text.
            let raw = &text[token_start..cursor];
            let token_text = if self.config.normalize_case {
                raw.to_lowercase()
            } else {
                raw.to_string()
            };

            // The length filter applies to the normalized text.
            if token_text.len() < self.config.min_token_length {
                continue;
            }

            return Some(Ok(Token::new(
                token_text,
                token_start,
                cursor - token_start,
            )));
        }
    }
}

/// N-gram counter that processes tokens in chunks
pub struct NgramCounter {
    config: TokenizerConfig,
    order: usize, // N-gram order (3 or 4)
}

impl NgramCounter {
    pub fn new(order: usize, config: TokenizerConfig) -> Self {
        assert!(order >= 2 && order <= 4, "Order must be between 2 and 4");
        Self { config, order }
    }
    
    /// Count N-grams from a token stream
    pub fn count_tokens<I>(&self, tokens: I) -> BigGrepResult<HashMap<Vec<String>, u64>>
    where
        I: Iterator<Item = BigGrepResult<Token>>,
    {
        let mut counts = HashMap::new();
        let mut buffer = Vec::with_capacity(self.order);
        
        for token_result in tokens {
            let token = token_result.map_err(|e| BigGrepError::Parse(e.to_string()))?;
            
            buffer.push(token.text);
            
            if buffer.len() == self.order {
                // We have a complete N-gram
                let ngram = buffer.clone();
                *counts.entry(ngram).or_insert(0) += 1;
                
                // Slide window for next N-gram
                buffer.remove(0);
            }
        }
        
        Ok(counts)
    }
    
    /// Count N-grams from text chunks in parallel
    pub fn count_chunks<I, R>(&self, chunks: I) -> BigGrepResult<HashMap<Vec<String>, u64>>
    where
        I: Iterator<Item = R>,
        R: AsRef<str> + Send + Sync,
    {
        use rayon::prelude::*;
        
        let chunk_counts: Vec<HashMap<Vec<String>, u64>> = chunks
            .par_bridge()
            .map(|chunk| {
                let tokens = TokenStream::new(chunk.as_ref(), self.config.clone());
                self.count_tokens(tokens).unwrap_or_default()
            })
            .collect();
        
        // Merge counts from all chunks
        let mut total_counts = HashMap::new();
        for chunk_count in chunk_counts {
            for (ngram, count) in chunk_count {
                *total_counts.entry(ngram).or_insert(0) += count;
            }
        }
        
        Ok(total_counts)
    }
    
    /// Get the N-gram order
    pub fn order(&self) -> usize {
        self.order
    }
}

/// Sorter for N-gram counts
pub struct NgramSorter {
    order: usize,
}

impl NgramSorter {
    pub fn new(order: usize) -> Self {
        Self { order }
    }
    
    /// Sort N-grams by their string representation
    pub fn sort_counts(&self, counts: HashMap<Vec<String>, u64>) -> Vec<NgramCount> {
        let mut items: Vec<_> = counts
            .into_iter()
            .map(|(tokens, count)| NgramCount::new(
                tokens.into_iter().map(|text| Token::new(text, 0, 0)).collect(),
                count
            ))
            .collect();
        
        items.sort_by(|a, b| {
            let a_str: Vec<&str> = a.tokens.iter().map(|t| t.text.as_str()).collect();
            let b_str: Vec<&str> = b.tokens.iter().map(|t| t.text.as_str()).collect();
            
            for i in 0..std::cmp::min(a_str.len(), b_str.len()) {
                match a_str[i].cmp(b_str[i]) {
                    Ordering::Equal => continue,
                    other => return other,
                }
            }
            a_str.len().cmp(&b_str.len())
        });
        
        items
    }
}

/// Vocabulary mapper for token compression.
///
/// Assigns each distinct token a dense `u32` ID in first-seen order
/// (`0, 1, 2, ...`) and supports lookup in both directions.
#[derive(Debug, Clone, Default)]
pub struct VocabularyMapper {
    token_to_id: HashMap<String, u32>,
    id_to_token: Vec<String>,
}

impl VocabularyMapper {
    /// Creates an empty vocabulary.
    pub fn new() -> Self {
        Self::default()
    }

    /// Add tokens to vocabulary and return their IDs.
    ///
    /// The result has one ID per input token, in input order. Tokens already
    /// in the vocabulary keep their existing ID; new tokens receive the next
    /// free ID.
    ///
    /// # Panics
    ///
    /// Panics if the vocabulary would grow beyond `u32::MAX` entries, since
    /// IDs could no longer be represented without collisions.
    pub fn add_tokens(&mut self, tokens: &[String]) -> Vec<u32> {
        tokens.iter().map(|token| self.intern(token)).collect()
    }

    /// Returns the ID of `token`, inserting it with the next free ID if it is
    /// not known yet.
    fn intern(&mut self, token: &str) -> u32 {
        // Borrowed lookup first: the token is only cloned when it is new.
        if let Some(&id) = self.token_to_id.get(token) {
            return id;
        }

        let next = self.id_to_token.len();
        assert!(
            next <= u32::MAX as usize,
            "vocabulary exceeds u32::MAX entries"
        );
        let id = next as u32; // lossless: checked by the assert above

        self.id_to_token.push(token.to_owned());
        self.token_to_id.insert(token.to_owned(), id);
        id
    }

    /// Get token by ID, or `None` if the ID has not been assigned.
    pub fn get_token(&self, id: u32) -> Option<&str> {
        self.id_to_token.get(id as usize).map(|s| s.as_str())
    }

    /// Get ID for token, or `None` if the token is not in the vocabulary.
    pub fn get_id(&self, token: &str) -> Option<u32> {
        self.token_to_id.get(token).copied()
    }

    /// Get vocabulary size (number of distinct tokens).
    pub fn size(&self) -> usize {
        self.id_to_token.len()
    }
}

/// Front end that ties tokenization, N-gram counting and sorting together.
pub struct NgramProcessor {
    counter: NgramCounter,
    sorter: NgramSorter,
    config: TokenizerConfig,
}

impl NgramProcessor {
    /// Creates a processor for the given N-gram `order` using
    /// `TokenizerConfig::default()`.
    ///
    /// # Panics
    ///
    /// Panics if `NgramCounter::new` rejects `order`.
    pub fn new(order: usize) -> Self {
        Self::with_config(order, TokenizerConfig::default())
    }

    /// Creates a processor for the given N-gram `order` with an explicit
    /// tokenizer configuration.
    ///
    /// # Panics
    ///
    /// Panics if `NgramCounter::new` rejects `order`.
    pub fn with_config(order: usize, config: TokenizerConfig) -> Self {
        let counter = NgramCounter::new(order, config.clone());
        let sorter = NgramSorter::new(order);
        Self {
            counter,
            sorter,
            config,
        }
    }

    /// Tokenizes `text`, counts its N-grams and returns them sorted.
    ///
    /// # Errors
    ///
    /// Returns whatever error the counter reports for the token stream.
    pub fn process_text(&self, text: &str) -> BigGrepResult<Vec<NgramCount>> {
        let stream = TokenStream::new(text, self.config.clone());
        self.counter
            .count_tokens(stream)
            .map(|counts| self.sorter.sort_counts(counts))
    }

    /// Returns the N-gram order this processor was built with.
    pub fn order(&self) -> usize {
        self.counter.order()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Default tokenization lowercases, splits on non-alphanumerics and drops
    /// tokens shorter than two bytes.
    #[test]
    fn test_tokenization() {
        let config = TokenizerConfig::default();
        let text = "Hello world, this is a test.";
        let tokens: Vec<_> = TokenStream::new(text, config)
            .collect::<Result<Vec<_>, _>>()
            .unwrap();

        // "a" is shorter than the default `min_token_length` of 2, so only
        // five of the six words survive.
        assert_eq!(tokens.len(), 5);
        assert_eq!(tokens[0].text, "hello");
        assert_eq!(tokens[1].text, "world");
        assert_eq!(tokens[4].text, "test");
    }

    /// Trigram counting over a simple sentence finds the leading 3-gram.
    #[test]
    fn test_trigram_counting() {
        let config = TokenizerConfig::default();
        // Clone for the counter so `config` is still available for the stream.
        let counter = NgramCounter::new(3, config.clone());

        let text = "the quick brown fox jumps over the lazy dog";
        let tokens = TokenStream::new(text, config);
        let counts = counter.count_tokens(tokens).unwrap();

        // Should have multiple 3-grams
        assert!(counts.len() > 0);

        // Check specific 3-gram
        let key = vec!["the".to_string(), "quick".to_string(), "brown".to_string()];
        assert!(counts.contains_key(&key));
    }

    /// IDs are assigned in first-seen order and are stable for repeats.
    #[test]
    fn test_vocabulary_mapping() {
        let mut mapper = VocabularyMapper::new();
        let tokens = vec!["hello".to_string(), "world".to_string(), "hello".to_string()];

        let ids = mapper.add_tokens(&tokens);
        assert_eq!(ids, vec![0, 1, 0]);

        assert_eq!(mapper.get_token(0), Some("hello"));
        assert_eq!(mapper.get_token(1), Some("world"));
        assert_eq!(mapper.get_id("hello"), Some(0));
        assert_eq!(mapper.get_id("world"), Some(1));
    }

    /// End-to-end processing returns a non-empty, sorted result.
    #[test]
    fn test_ngram_processor() {
        let processor = NgramProcessor::new(3);
        let text = "the quick brown fox jumps over the lazy dog the quick brown";
        let counts = processor.process_text(text).unwrap();

        assert!(counts.len() > 0);
        assert_eq!(processor.order(), 3);

        // Check that results are sorted
        for i in 0..counts.len().saturating_sub(1) {
            let a_str: Vec<&str> = counts[i].tokens.iter().map(|t| t.text.as_str()).collect();
            let b_str: Vec<&str> = counts[i + 1].tokens.iter().map(|t| t.text.as_str()).collect();

            let a_str_repr = a_str.join(" ");
            let b_str_repr = b_str.join(" ");
            assert!(a_str_repr <= b_str_repr, "Results should be sorted: {} >= {}", a_str_repr, b_str_repr);
        }
    }
}
