Source code for src.analysis.ai_insights

"""AI-powered insights using OpenAI GPT-4."""

import os
from typing import Dict, List, Any, Optional
import pandas as pd
from openai import OpenAI

from ..utils.errors import ProcessingError
from ..utils.logging import get_logger

logger = get_logger(__name__)


class AIInsightEngine:
    """Generate AI-powered insights using GPT-4."""

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize AI insight engine.

        Args:
            api_key: OpenAI API key (defaults to OPENAI_API_KEY env var)
        """
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            logger.warning("OpenAI API key not found - AI insights will be disabled")
            self.client = None
        else:
            self.client = OpenAI(api_key=self.api_key)
            logger.info("AI insight engine initialized")
    def generate_executive_summary(
        self,
        df: pd.DataFrame,
        period_metrics: pd.DataFrame,
        market_concentration: Dict[str, Any]
    ) -> str:
        """
        Generate executive summary of findings.

        Args:
            df: Full time series DataFrame
            period_metrics: Aggregate period metrics
            market_concentration: Market concentration metrics

        Returns:
            Executive summary text
        """
        if not self.client:
            return self._fallback_executive_summary(period_metrics)

        try:
            # Prepare data summary
            data_summary = self._prepare_data_summary(df, period_metrics, market_concentration)

            prompt = f"""You are a statistical analyst with strict scientific integrity. Analyze this Share of Search data.

DATA QUALITY NOTICE:
- Google Trends uses undisclosed sampling methods (Choi & Varian 2012)
- Same query on different days shows correlation of only 0.79-0.94 (Cebrián & Domenech 2023)
- Documented measurement error: ±5% variability between retrievals
- Coverage bias: excludes non-Google users and specialized platforms

Data Summary:
{data_summary}

Provide a 3-paragraph statistical summary:

1. Market Position Analysis:
   - Report shares with ±5% measurement error acknowledgment
   - Use correlation language, not causal claims
   - State "observed patterns" not "explanations"

2. Temporal Patterns:
   - Report statistical trends (direction, correlation coefficients where applicable)
   - Acknowledge alternative explanations
   - Use "correlated with" not "caused by"

3. Statistical Context:
   - Market concentration metrics
   - Volatility comparisons
   - Data limitations that affect interpretation

CONSTRAINTS:
- NEVER claim causation without experimental evidence
- ALWAYS quantify uncertainty
- Use statistical language throughout
- Acknowledge data limitations explicitly"""

            response = self.client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {
                        "role": "system",
                        "content": (
                            "You are a rigorous statistical analyst. You NEVER make causal "
                            "claims without experimental evidence. You ALWAYS acknowledge "
                            "measurement error and data limitations."
                        )
                    },
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3,
                max_tokens=600
            )

            summary = response.choices[0].message.content.strip()
            logger.info("Generated AI executive summary")
            return summary

        except Exception as e:
            logger.warning(f"AI summary generation failed: {e}")
            return self._fallback_executive_summary(period_metrics)
    def explain_anomalies(
        self,
        df: pd.DataFrame,
        anomalies: pd.DataFrame
    ) -> List[Dict[str, str]]:
        """
        Generate statistical descriptions of detected anomalies.

        Args:
            df: Full DataFrame
            anomalies: DataFrame of anomalous points

        Returns:
            List of anomaly descriptions
        """
        if not self.client or anomalies.empty:
            return []

        try:
            explanations = []

            # Limit to top 3 anomalies
            top_anomalies = anomalies.nlargest(3, 'z_score')

            for _, anomaly in top_anomalies.iterrows():
                query = anomaly['query']
                date = anomaly['date'].strftime('%Y-%m-%d')
                value = anomaly['share_of_search']
                z_score = anomaly['z_score']

                # Get context (surrounding data)
                query_data = df[df['query'] == query].sort_values('date')
                avg_value = query_data['share_of_search'].mean()
                deviation_pct = ((value - avg_value) / avg_value) * 100

                prompt = f"""STATISTICAL OBSERVATION: Anomaly detected in Share of Search data.

Brand: {query}
Date: {date}
Observed Share: {value:.1f}%
Average Share: {avg_value:.1f}%
Deviation: {deviation_pct:+.1f}%
Z-score: {z_score:.2f}

DATA QUALITY NOTICE: Google Trends has ±5% measurement error (Cebrián & Domenech 2023).

Provide a 2-3 sentence statistical description:
1. Quantify the deviation in statistical terms
2. Note that causation cannot be determined from this data alone
3. List 2-3 possible explanations that would require independent verification

CONSTRAINTS:
- Use "correlated with" or "associated with" not "caused by"
- State "requires external validation" for any causal hypothesis
- Acknowledge measurement error could explain part or all of the deviation
- Do NOT make definitive causal claims"""

                response = self.client.chat.completions.create(
                    model="gpt-4",
                    messages=[
                        {
                            "role": "system",
                            "content": (
                                "You are a statistical analyst. You describe observations "
                                "without claiming causation. You ALWAYS acknowledge "
                                "measurement error and the need for external validation."
                            )
                        },
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.3,
                    max_tokens=200
                )

                explanation = response.choices[0].message.content.strip()

                explanations.append({
                    "query": query,
                    "date": date,
                    "value": f"{value:.1f}%",
                    "explanation": explanation
                })

            logger.info(f"Generated {len(explanations)} anomaly descriptions")
            return explanations

        except Exception as e:
            logger.warning(f"Anomaly description failed: {e}")
            return []
    def generate_competitive_insights(
        self,
        period_metrics: pd.DataFrame
    ) -> str:
        """
        Generate statistical competitive analysis.

        Args:
            period_metrics: Aggregate metrics per brand

        Returns:
            Statistical competitive analysis text
        """
        if not self.client:
            return ""

        try:
            # Prepare metrics summary
            metrics_text = period_metrics.to_string()

            prompt = f"""Analyze these Share of Search metrics with statistical rigor:

DATA QUALITY NOTICE:
- Google Trends measurement error: ±5% (Cebrián & Domenech 2023)
- Share values should be interpreted with this uncertainty
- Correlations do not imply causation

{metrics_text}

Provide statistical analysis in 4 sections:

1. RELATIVE POSITIONING
   - Report rank order with measurement uncertainty
   - State "Brand X observed at Y% ± 5%" not "Brand X dominates"
   - Use statistical language

2. VOLATILITY PATTERNS
   - Compare volatility metrics quantitatively
   - Higher volatility = less predictable search patterns
   - Acknowledge this is descriptive, not prescriptive

3. OBSERVED CORRELATIONS
   - Note any patterns in the data
   - Use "correlated with" or "associated with"
   - Do NOT claim these explain performance differences

4. DATA LIMITATIONS
   - Remind that this is search data only, not market performance
   - Note coverage biases (excludes non-Google users)
   - Strategic decisions require additional data sources

CONSTRAINTS:
- NO causal claims without experimental evidence
- NO strategic recommendations (requires external data)
- Use statistical language throughout
- Acknowledge uncertainty explicitly"""

            response = self.client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {
                        "role": "system",
                        "content": (
                            "You are a statistical analyst. You describe patterns in data "
                            "without making causal or strategic claims. You ALWAYS acknowledge "
                            "measurement error and data limitations."
                        )
                    },
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3,
                max_tokens=700
            )

            insights = response.choices[0].message.content.strip()
            logger.info("Generated competitive statistical analysis")
            return insights

        except Exception as e:
            logger.warning(f"Competitive analysis generation failed: {e}")
            return ""
    def generate_recommendations(
        self,
        df: pd.DataFrame,
        period_metrics: pd.DataFrame
    ) -> Dict[str, str]:
        """
        Generate statistical profiles per brand (NOT strategic recommendations).

        Args:
            df: Full time series
            period_metrics: Aggregate metrics

        Returns:
            Dictionary mapping brand to statistical profile
        """
        if not self.client:
            return {}

        try:
            profiles = {}

            for _, row in period_metrics.iterrows():
                brand = row['query']
                avg_share = row['avg_share']
                volatility = row.get('volatility', 0)

                # Get trend info
                brand_data = df[df['query'] == brand]
                trend = brand_data['trend_direction'].iloc[-1] if len(brand_data) > 0 else 'stable'

                prompt = f"""Generate a statistical profile for this brand (NOT recommendations):

Brand: {brand}
Average Share of Search: {avg_share:.1f}% (±5% measurement error)
Trend Direction: {trend}
Volatility: {volatility:.2f}

DATA QUALITY NOTICE: Google Trends has ±5% measurement error (Cebrián & Domenech 2023)

Provide a 3-point statistical profile:

1. RELATIVE POSITION
   - Where this brand sits relative to competitors in the dataset
   - Note measurement uncertainty

2. TEMPORAL PATTERN
   - Describe the trend statistically (direction, approximate slope)
   - State "observed correlation with time" not "performance improvement"

3. VARIABILITY CONTEXT
   - Interpret volatility value
   - Higher volatility = less predictable search patterns
   - This is descriptive only

CONSTRAINTS:
- NO strategic recommendations (requires domain expertise + external data)
- NO causal claims
- Statistical description only
- Acknowledge this is search data, not market performance
- End with: "Strategic decisions require additional data sources beyond Google Trends"
"""

                response = self.client.chat.completions.create(
                    model="gpt-4",
                    messages=[
                        {
                            "role": "system",
                            "content": (
                                "You are a statistical analyst. You provide statistical "
                                "descriptions ONLY, never strategic advice. You ALWAYS "
                                "acknowledge measurement error and data limitations."
                            )
                        },
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.3,
                    max_tokens=2000
                )

                profiles[brand] = response.choices[0].message.content.strip()

            logger.info(f"Generated statistical profiles for {len(profiles)} brands")
            return profiles

        except Exception as e:
            logger.warning(f"Statistical profile generation failed: {e}")
            return {}
    def _prepare_data_summary(
        self,
        df: pd.DataFrame,
        period_metrics: pd.DataFrame,
        market_concentration: Dict[str, Any]
    ) -> str:
        """Prepare data summary for AI analysis."""
        summary_parts = []

        # Period info
        start_date = df['date'].min().strftime('%Y-%m-%d')
        end_date = df['date'].max().strftime('%Y-%m-%d')
        summary_parts.append(f"Period: {start_date} to {end_date}")

        # Market concentration
        hhi = market_concentration.get('hhi', 0)
        concentration = market_concentration.get('concentration', 'unknown')
        summary_parts.append(f"\nMarket Concentration: HHI={hhi:.0f} ({concentration})")

        # Brand metrics
        summary_parts.append("\nBrand Performance:")
        for _, row in period_metrics.iterrows():
            brand = row['query']
            avg_share = row['avg_share']
            volatility = row.get('volatility', 0)
            summary_parts.append(f"- {brand}: {avg_share:.1f}% avg share, volatility={volatility:.2f}")

        # Trends
        summary_parts.append("\nTrends:")
        for query in df['query'].unique():
            query_data = df[df['query'] == query]
            if len(query_data) > 0:
                trend = query_data['trend_direction'].iloc[-1]
                slope = query_data['trend_slope'].iloc[-1]
                summary_parts.append(f"- {query}: {trend} (slope={slope:.3f})")

        return "\n".join(summary_parts)

    def _fallback_executive_summary(self, period_metrics: pd.DataFrame) -> str:
        """Generate fallback summary without AI."""
        # Find leader
        leader = period_metrics.loc[period_metrics['avg_share'].idxmax()]

        summary = f"""Share of Search Analysis Summary

Market Leader: {leader['query']} with {leader['avg_share']:.1f}% average share.

The analysis covers {len(period_metrics)} brands/queries over the specified period.
"""

        if len(period_metrics) > 1:
            second = period_metrics.nlargest(2, 'avg_share').iloc[1]
            summary += f"Second place: {second['query']} at {second['avg_share']:.1f}% share. "

        summary += "\n\nFor detailed AI-powered insights, please configure an OpenAI API key."

        return summary
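
# Usage sketch: a minimal, hypothetical example of driving AIInsightEngine.
# The DataFrame columns ('date', 'query', 'share_of_search', 'trend_direction',
# 'trend_slope', 'avg_share', 'volatility') mirror those the methods above
# read; all sample values below are invented for illustration only.
if __name__ == "__main__":
    sample_df = pd.DataFrame({
        "date": pd.to_datetime(["2024-01-01", "2024-01-08", "2024-01-01", "2024-01-08"]),
        "query": ["brand_a", "brand_a", "brand_b", "brand_b"],
        "share_of_search": [55.0, 57.0, 45.0, 43.0],
        "trend_direction": ["rising", "rising", "falling", "falling"],
        "trend_slope": [0.3, 0.3, -0.3, -0.3],
    })
    sample_metrics = pd.DataFrame({
        "query": ["brand_a", "brand_b"],
        "avg_share": [56.0, 44.0],
        "volatility": [1.2, 1.5],
    })
    sample_concentration = {"hhi": 5072, "concentration": "high"}

    # Without OPENAI_API_KEY set, client is None and this exercises the
    # non-AI fallback path (_fallback_executive_summary).
    engine = AIInsightEngine()
    print(engine.generate_executive_summary(sample_df, sample_metrics, sample_concentration))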