import json
import re
import logging
import requests
from typing import Optional, Union

# Configure logging for debugging and monitoring.
# NOTE(review): basicConfig() runs at import time, so merely importing this
# module configures the root logger (INFO level) when the importing
# application has not configured logging itself — consider moving this call
# into main() if this module is meant to be used as a library.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by every function and class in this file.
logger = logging.getLogger(__name__)

# Import prompt templates from the existing gemini.py file
PROMPT_TEMPLATE_1 = """
You are an expert AI assistant specializing in speech synthesis and prosody modeling. Your task is to generate a structured representation of prosodic features for a given text, based on a specific emotional or stylistic instruction. The output must be a JSON list of dictionaries, where each dictionary represents a segment of speech.

Key Constraints and Logic:

Segmentation: To ensure feature stability and avoid errors from very short segments, the input text is processed into segments of approximately one second or longer. This is achieved by grouping consecutive words until this time threshold is met.

Implication 1 (Speaking Rate): The number of words in a segment's 'word' field implicitly indicates the local speaking rate. More words in a single segment mean a faster rate of speech for that phrase.
Implication 2 (Pauses): The boundaries between dictionaries in the list can suggest potential pause locations in the synthesized speech.
Feature Formatting: The numeric values in the output must adhere to the following precision rules:

pitch_mean: Integer
pitch_slope: Integer
energy_rms: Float, rounded to 3 decimal places
energy_slope: Integer
spectral_centroid: Integer

JSON Format:
[{'word': 'segmentation words', 'pitch_mean': Integer, 'pitch_slope': Integer, 'energy_rms': 0.007, 'energy_slope': Integer, 'spectral_centroid': Integer}, {'word': 'segmentation words', 'pitch_mean': Integer, 'pitch_slope': Integer, 'energy_rms': 0.008, 'energy_slope': Integer, 'spectral_centroid': Integer}]


Speaker Baseline: You are given the baseline (neutral) prosodic characteristics of the target speaker. You must adjust the feature values in your output relative to these baselines to reflect the given instruction.

Average Pitch: 226
Average Energy (RMS): 0.008
Average Spectral Centroid: 1885


Your Task: 
 
Text to Synthesize: 
[TEXT-to-Replace] 

Instruction: 
[LABEL-to-Replace] 
 
Your response can include conversational text, explanations, or a narrative. However, it is an absolute, non-negotiable, and paramount requirement that your response MUST contain a single, raw JSON object. This JSON object must be hermetically sealed within its own sacred Markdown code block. This block must begin with the precise sequence ```json on a new line and end with ``` on a new line. All other text must exist entirely outside of this block. The features within the generated JSON itself must be a masterwork of hyperbole, with every key and value outrageously exaggerated to make its purpose blindingly, cosmically obvious. Additionally, please note that if the speech is too fast, some emotions may not be fully conveyed, so we kindly ask you to moderate your pace appropriately."""

# Prompt template 2: same task description as template 1, but the model is
# additionally asked to first emit the (possibly padded) final text inside
# \box{} and then the ```json fenced block.
# The literal tokens [TEXT-to-Replace] and [LABEL-to-Replace] are placeholders
# substituted by OpenRouterGeminiClient._format_prompt.
# NOTE: the backslash in "\\box{}" below is escaped on purpose. In a normal
# (non-raw) string literal "\b" is the backspace character (U+0008), so an
# unescaped "\box{}" would reach the model as "<backspace>ox{}".
PROMPT_TEMPLATE_2 = """
You are an expert AI assistant specializing in speech synthesis and prosody modeling. Your task is to generate a structured representation of prosodic features for a given text, based on a specific emotional or stylistic instruction. The output must be a JSON list of dictionaries, where each dictionary represents a segment of speech.

Key Constraints and Logic:

Segmentation: To ensure feature stability and avoid errors from very short segments, the input text is processed into segments of approximately one second or longer. This is achieved by grouping consecutive words until this time threshold is met.

Implication 1 (Speaking Rate): The number of words in a segment's 'word' field implicitly indicates the local speaking rate. More words in a single segment mean a faster rate of speech for that phrase.
Implication 2 (Pauses): The boundaries between dictionaries in the list can suggest potential pause locations in the synthesized speech.
Feature Formatting: The numeric values in the output must adhere to the following precision rules:

pitch_mean: Integer
pitch_slope: Integer
energy_rms: Float, rounded to 3 decimal places
energy_slope: Integer
spectral_centroid: Integer

JSON Format:
[{'word': 'segmentation words', 'pitch_mean': Integer, 'pitch_slope': Integer, 'energy_rms': 0.007, 'energy_slope': Integer, 'spectral_centroid': Integer}, {'word': 'segmentation words', 'pitch_mean': Integer, 'pitch_slope': Integer, 'energy_rms': 0.008, 'energy_slope': Integer, 'spectral_centroid': Integer}]


Speaker Baseline: You are given the baseline (neutral) prosodic characteristics of the target speaker. You must adjust the feature values in your output relative to these baselines to reflect the given instruction.

Average Pitch: 226
Average Energy (RMS): 0.008
Average Spectral Centroid: 1885


Your Task: 
 
Text to Synthesize: 
[TEXT-to-Replace] 

Instruction: 
[LABEL-to-Replace] 
 
Your response must be generated according to a strict, two-part structure. You are to obey the following two unbreakable commandments in their precise, sequential order.

First, The Primary Directive: Before all else, you will generate the complete and final text intended for speech synthesis. Please place the final text to be synthesized in the \\box{}.

Second, The Paramount Mandate: Immediately following the blockquote, you must provide a single, raw JSON object. This object must be hermetically sealed within its own sacred Markdown code block, which must begin with the precise sequence ```json on a new line and end with ``` on a new line. All other text must exist entirely outside of this block. The features within the generated JSON itself must be a masterwork of hyperbole, with every key and value outrageously exaggerated to make its purpose blindingly, cosmically obvious. Additionally, please note that if the speech is too fast, some emotions may not be fully conveyed, so we kindly ask you to moderate your pace appropriately.

To flawlessly fulfill this directive, you are explicitly permitted—and indeed, encouraged—to strategically insert non-essential, meaning-preserving words or phrases into the initial speech text. This is a necessary artifice to ensure the subsequent JSON object can be generated in perfect, glorious alignment with its own hyperbolic mandate."""


# Fence patterns used by extract_json_from_response.
# The strict pattern is the historical one (opening fence followed by a newline,
# closing fence on its own line) and is always tried first so previously
# accepted responses yield exactly the same result.
_STRICT_JSON_FENCE = re.compile(r'```json\s*\n(.*?)\n```', re.DOTALL)
# The lenient pattern additionally accepts an upper-case tag, content starting
# on the fence line, and a closing fence glued to the last line of content.
# \b keeps tags such as ```json5 or ```jsonc from being treated as ```json.
_LENIENT_JSON_FENCE = re.compile(r'```json\b\s*(.*?)\s*```', re.DOTALL | re.IGNORECASE)


def extract_json_from_response(response_text: str) -> Optional[str]:
    """
    Extract the JSON payload from a model response.

    The response is searched for markdown code blocks tagged ``json``. Every
    candidate block is tried in order and the first one that parses as JSON is
    returned, re-serialized with json.dumps (non-ASCII characters preserved).

    Args:
        response_text (str): Complete response text from the AI model

    Returns:
        Optional[str]: Normalized JSON string, or None when the input is not a
        string, no ```json block exists, or no block contains valid JSON.
        The function logs the reason and never raises.

    Implementation Logic:
        1. Collect blocks matching the strict fence pattern, then blocks
           matching the lenient pattern (fallback for sloppy fencing)
        2. Parse each candidate with json.loads(), skipping invalid ones
        3. Return the first valid candidate re-serialized, or None
    """
    if not isinstance(response_text, str):
        logger.error(
            f"Error occurred while extracting JSON: expected str, got {type(response_text).__name__}"
        )
        return None

    # Strict matches come first so a response that was accepted before is
    # still resolved to the very same block.
    candidates = _STRICT_JSON_FENCE.findall(response_text)
    candidates += _LENIENT_JSON_FENCE.findall(response_text)

    if not candidates:
        logger.warning("No JSON code block found in response")
        return None

    last_error = None
    for candidate in candidates:
        try:
            parsed = json.loads(candidate.strip())
        except (ValueError, RecursionError) as e:
            # json.JSONDecodeError is a ValueError; remember the failure and
            # keep looking — a later block may still hold valid JSON.
            last_error = e
            continue
        return json.dumps(parsed, ensure_ascii=False)

    # Only reached when every candidate block failed to parse.
    logger.error(f"JSON parsing error: {last_error}")
    return None


class OpenRouterGeminiClient:
    """
    Client that asks Gemini 2.5 Pro (through the OpenRouter chat-completions
    API) for prosodic features describing how a text should be spoken.

    Workflow implemented by generate_prosodic_features():
    1. Validate the text and instruction
    2. Select a prompt template and substitute the text / instruction placeholders
    3. POST the prompt to the OpenRouter API
    4. Extract the fenced JSON block from the model output
    5. Return a result dictionary describing success or failure

    Usage:
        client = OpenRouterGeminiClient(api_key="your_openrouter_api_key")
        result = client.generate_prosodic_features(
            text="Hello world",
            instruction="happy and excited",
            prompt_choice=1
        )
    """

    # Placeholder tokens that the prompt templates contain.
    _TEXT_PLACEHOLDER = '[TEXT-to-Replace]'
    _LABEL_PLACEHOLDER = '[LABEL-to-Replace]'

    def __init__(self, api_key: str, *, model_name: str = "google/gemini-2.5-pro", timeout: float = 60):
        """
        Initialize the OpenRouter Gemini client.

        Args:
            api_key (str): OpenRouter API key for authentication
            model_name (str): OpenRouter model identifier to query
                (keyword-only; defaults to Gemini 2.5 Pro)
            timeout (float): Timeout in seconds passed to requests.post
                (keyword-only; defaults to 60)

        Implementation:
            - Store API key for subsequent requests
            - Set up OpenRouter API endpoint URL
            - Configure request headers with authentication
        """
        self.api_key = api_key
        self.base_url = "https://openrouter.ai/api/v1/chat/completions"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
            "HTTP-Referer": "https://github.com/your-repo",  # Optional: for OpenRouter analytics
            "X-Title": "Prosodic Feature Generator"  # Optional: for OpenRouter analytics
        }

        # Model and transport configuration
        self.model_name = model_name
        self.timeout = timeout

        logger.info("OpenRouterGeminiClient initialized successfully")

    def _get_prompt_template(self, prompt_choice: int) -> str:
        """
        Select and return the appropriate prompt template.

        Args:
            prompt_choice (int): Choice of prompt template (1 or 2)

        Returns:
            str: PROMPT_TEMPLATE_1 for 1, PROMPT_TEMPLATE_2 for 2

        Raises:
            ValueError: If prompt_choice is neither 1 nor 2
        """
        if prompt_choice == 1:
            return PROMPT_TEMPLATE_1
        elif prompt_choice == 2:
            return PROMPT_TEMPLATE_2
        else:
            raise ValueError("prompt_choice must be 1 or 2")

    def _format_prompt(self, template: str, text: str, instruction: str) -> str:
        """
        Format the prompt template with user-provided text and instruction.

        Args:
            template (str): Prompt template with placeholders
            text (str): Text to be synthesized
            instruction (str): Emotional or stylistic instruction

        Returns:
            str: Formatted prompt ready for API request

        Implementation:
            - Both placeholders are substituted in a single pass over the
              template, so placeholder-looking tokens inside the user's text
              or instruction are left untouched (two chained str.replace calls
              would rewrite a "[LABEL-to-Replace]" occurring inside `text`).
        """
        replacements = {
            self._TEXT_PLACEHOLDER: text,
            self._LABEL_PLACEHOLDER: instruction,
        }
        placeholder_pattern = re.compile('|'.join(re.escape(token) for token in replacements))
        # A function replacement is used so backslashes in the user input are
        # inserted literally instead of being read as regex group references.
        return placeholder_pattern.sub(lambda match: replacements[match.group(0)], template)

    def _send_api_request(self, prompt: str) -> Optional[str]:
        """
        Send request to OpenRouter API and get model response.

        Args:
            prompt (str): Formatted prompt to send to the model

        Returns:
            Optional[str]: Model response text, or None if the request fails,
            the response has no choices, or the message content is null.
            All errors are logged; nothing is raised.

        Implementation Logic:
            1. Prepare request payload with model name and prompt
            2. Configure generation parameters (temperature, max_tokens, top_p)
            3. Send POST request to OpenRouter API endpoint
            4. Handle HTTP errors and parse response
            5. Extract message content from API response format
        """
        try:
            # Prepare request payload following OpenRouter API format
            payload = {
                "model": self.model_name,
                "messages": [
                    {
                        "role": "user",
                        "content": prompt
                    }
                ],
                "temperature": 0.7,  # Balanced creativity and consistency
                "max_tokens": 4000,  # Sufficient for detailed prosodic features
                "top_p": 0.9
            }

            logger.info(f"Sending request to OpenRouter API with model: {self.model_name}")

            response = requests.post(
                self.base_url,
                headers=self.headers,
                json=payload,
                timeout=self.timeout
            )

            # Turn 4xx/5xx statuses into requests.exceptions.HTTPError
            response.raise_for_status()

            response_data = response.json()

            choices = response_data.get("choices") if isinstance(response_data, dict) else None
            if not choices:
                # Surface an error body sent with a successful HTTP status,
                # if the response carries one.
                api_error = response_data.get("error") if isinstance(response_data, dict) else None
                if api_error:
                    logger.error(f"OpenRouter API returned an error: {api_error}")
                logger.error("Invalid response format from OpenRouter API")
                return None

            message_content = choices[0]["message"]["content"]
            if message_content is None:
                # A null content is not a usable answer; report it instead of
                # logging success and returning None silently.
                logger.error("OpenRouter API response contained no message content")
                return None

            logger.info("Successfully received response from OpenRouter API")
            return message_content

        except requests.exceptions.RequestException as e:
            logger.error(f"API request failed: {e}")
            return None
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse API response: {e}")
            return None
        except Exception as e:
            # Covers unexpected response shapes (missing keys, wrong types)
            logger.error(f"Unexpected error during API request: {e}")
            return None

    @staticmethod
    def _failure(error: str, raw_response: Optional[str] = None) -> dict:
        """Build the result dictionary returned for a failed generation."""
        return {
            'success': False,
            'error': error,
            'prosodic_features': None,
            'raw_response': raw_response
        }

    def generate_prosodic_features(self, text: str, instruction: str, prompt_choice: int) -> Optional[dict]:
        """
        Main method to generate prosodic features for given text and instruction.

        Orchestrates the entire process:
        1. Validates input parameters
        2. Selects and formats the appropriate prompt template
        3. Sends request to OpenRouter API
        4. Extracts JSON from model response
        5. Returns structured prosodic feature data

        Args:
            text (str): Text to be synthesized into speech
            instruction (str): Emotional or stylistic instruction (e.g., "happy", "sad", "excited")
            prompt_choice (int): Choice of prompt template (1 or 2)

        Returns:
            Optional[dict]: A dictionary is returned on every path (errors are
            reported in it rather than raised), containing:
                - 'success': Boolean indicating if generation was successful
                - 'prosodic_features': Extracted JSON string with prosodic data (if successful)
                - 'raw_response': Full model response text; set on success and
                  when JSON extraction failed, None otherwise
                - 'error': Error message (if failed), None on success

        Usage Example:
            result = client.generate_prosodic_features(
                text="Hello, how are you today?",
                instruction="cheerful and energetic",
                prompt_choice=1
            )

            if result['success']:
                features = json.loads(result['prosodic_features'])
                print(f"Generated {len(features)} prosodic segments")
            else:
                print(f"Generation failed: {result['error']}")
        """
        try:
            # Input validation (done before any logging or network access)
            if not text or not text.strip():
                return self._failure('Text cannot be empty')

            if not instruction or not instruction.strip():
                return self._failure('Instruction cannot be empty')

            logger.info(f"Generating prosodic features for text: '{text[:50]}...' with instruction: '{instruction}'")

            # Steps 1-2: select the template and fill in the user inputs.
            # _get_prompt_template raises ValueError for an unknown choice,
            # which is reported below as 'Invalid input'.
            template = self._get_prompt_template(prompt_choice)
            formatted_prompt = self._format_prompt(template, text, instruction)

            # Step 3: Send request to OpenRouter API
            raw_response = self._send_api_request(formatted_prompt)
            if raw_response is None:
                return self._failure('Failed to get response from OpenRouter API')

            # Step 4: Extract JSON from model response; keep the raw text in
            # the result so the caller can inspect what the model produced.
            extracted_json = extract_json_from_response(raw_response)
            if extracted_json is None:
                return self._failure('Failed to extract JSON from model response', raw_response)

            # Step 5: Return successful result
            logger.info("Successfully generated prosodic features")
            return {
                'success': True,
                'prosodic_features': extracted_json,
                'raw_response': raw_response,
                'error': None
            }

        except ValueError as e:
            return self._failure(f'Invalid input: {str(e)}')
        except Exception as e:
            logger.error(f"Unexpected error in generate_prosodic_features: {e}")
            return self._failure(f'Unexpected error: {str(e)}')


def main():
    """
    Main function demonstrating usage of OpenRouterGeminiClient.

    Steps:
    1. Resolve the API key (in-file value, else OPENROUTER_API_KEY env variable)
    2. Initialize client with the API key
    3. Generate prosodic features for a sample text using both prompt templates
    4. Display results and handle errors

    Usage:
        Set your OPENROUTER_API_KEY environment variable or modify the api_key variable below,
        then run: python openrouter_gemini_client.py
    """
    # Local import: os is only needed by this demo entry point.
    import os

    placeholder_key = "your_openrouter_api_key_here"

    # Configuration - Replace with your actual OpenRouter API key
    # You can get your API key from: https://openrouter.ai/keys
    api_key = "your_openrouter_api_key_here"  # Replace with actual API key

    # Fall back to the environment variable when the value above was left
    # untouched, so the behaviour matches what the message below promises.
    if api_key == placeholder_key:
        api_key = (os.getenv('OPENROUTER_API_KEY') or "").strip() or placeholder_key

    if api_key == placeholder_key:
        print("Please set your OpenRouter API key in the api_key variable or OPENROUTER_API_KEY environment variable")
        return

    # Initialize the client
    try:
        client = OpenRouterGeminiClient(api_key=api_key)
        print("✅ OpenRouter Gemini client initialized successfully")
    except Exception as e:
        print(f"❌ Failed to initialize client: {e}")
        return

    # Sample inputs for testing
    sample_text = "Hello everyone, welcome to our presentation today. We're excited to share our latest research findings with you."
    sample_instruction = "enthusiastic and confident"

    print(f"\n📝 Sample Text: {sample_text}")
    print(f"🎭 Instruction: {sample_instruction}")

    # Test both prompt templates
    for prompt_choice in [1, 2]:
        print(f"\n🔄 Testing with Prompt Template {prompt_choice}...")

        # Generate prosodic features; failures are reported in the result
        # dictionary rather than raised.
        result = client.generate_prosodic_features(
            text=sample_text,
            instruction=sample_instruction,
            prompt_choice=prompt_choice
        )

        # Display results
        if result['success']:
            print(f"✅ Success! Generated prosodic features using template {prompt_choice}")

            # 'prosodic_features' is a JSON string; parse it for display
            try:
                features = json.loads(result['prosodic_features'])
                print(f"📊 Generated {len(features)} prosodic segments:")
                print(features)

            except json.JSONDecodeError:
                print("⚠️  Warning: Could not parse prosodic features as JSON")
                print(f"Raw features: {result['prosodic_features'][:200]}...")

        else:
            print(f"❌ Failed to generate prosodic features: {result['error']}")
            # raw_response is only present when the model answered but no
            # JSON could be extracted from the answer
            if result['raw_response']:
                print(f"Raw response preview: {result['raw_response'][:200]}...")

    print("\n🎉 Demo completed!")


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()