Source code for src.analysis.ai_insights

"""AI-powered insights using OpenAI GPT-4."""

import os
from typing import Dict, List, Any, Optional
import pandas as pd
from openai import OpenAI

from ..utils.errors import ProcessingError
from ..utils.logging import get_logger

logger = get_logger(__name__)


class AIInsightEngine:
    """Generate AI-powered insights using GPT-4."""

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize AI insight engine.

        Args:
            api_key: OpenAI API key (defaults to OPENAI_API_KEY env var)
        """
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            logger.warning("OpenAI API key not found - AI insights will be disabled")
            self.client = None
        else:
            self.client = OpenAI(api_key=self.api_key)
            logger.info("AI insight engine initialized")
    def generate_executive_summary(
        self,
        df: pd.DataFrame,
        period_metrics: pd.DataFrame,
        market_concentration: Dict[str, Any]
    ) -> str:
        """
        Generate executive summary of findings.

        Args:
            df: Full time series DataFrame
            period_metrics: Aggregate period metrics
            market_concentration: Market concentration metrics

        Returns:
            Executive summary text
        """
        if not self.client:
            return self._fallback_executive_summary(period_metrics)

        try:
            # Prepare data summary
            data_summary = self._prepare_data_summary(df, period_metrics, market_concentration)

            prompt = f"""You are a statistical analyst with strict scientific integrity. Analyze this Share of Search data.

DATA QUALITY NOTICE:
- Google Trends uses undisclosed sampling methods (Choi & Varian 2012)
- Same query on different days shows correlation of only 0.79-0.94 (Cebrián & Domenech 2023)
- Documented measurement error: ±5% variability between retrievals
- Coverage bias: excludes non-Google users and specialized platforms

Data Summary:
{data_summary}

Provide a 3-paragraph statistical summary:

1. Market Position Analysis:
   - Report shares with ±5% measurement error acknowledgment
   - Use correlation language, not causal claims
   - State "observed patterns" not "explanations"

2. Temporal Patterns:
   - Report statistical trends (direction, correlation coefficients where applicable)
   - Acknowledge alternative explanations
   - Use "correlated with" not "caused by"

3. Statistical Context:
   - Market concentration metrics
   - Volatility comparisons
   - Data limitations that affect interpretation

CONSTRAINTS:
- NEVER claim causation without experimental evidence
- ALWAYS quantify uncertainty
- Use statistical language throughout
- Acknowledge data limitations explicitly"""

            response = self.client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {
                        "role": "system",
                        "content": (
                            "You are a rigorous statistical analyst. You NEVER make causal "
                            "claims without experimental evidence. You ALWAYS acknowledge "
                            "measurement error and data limitations."
                        )
                    },
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3,
                max_tokens=600
            )

            summary = response.choices[0].message.content.strip()
            logger.info("Generated AI executive summary")
            return summary

        except Exception as e:
            logger.warning(f"AI summary generation failed: {e}")
            return self._fallback_executive_summary(period_metrics)
    def explain_anomalies(
        self,
        df: pd.DataFrame,
        anomalies: pd.DataFrame
    ) -> List[Dict[str, str]]:
        """
        Generate statistical descriptions of detected anomalies.

        Args:
            df: Full DataFrame
            anomalies: DataFrame of anomalous points

        Returns:
            List of anomaly descriptions
        """
        if not self.client or anomalies.empty:
            return []

        try:
            explanations = []

            # Limit to top 3 anomalies
            top_anomalies = anomalies.nlargest(3, 'z_score')

            for _, anomaly in top_anomalies.iterrows():
                query = anomaly['query']
                date = anomaly['date'].strftime('%Y-%m-%d')
                value = anomaly['share_of_search']
                z_score = anomaly['z_score']

                # Get context (surrounding data)
                query_data = df[df['query'] == query].sort_values('date')
                avg_value = query_data['share_of_search'].mean()
                deviation_pct = ((value - avg_value) / avg_value) * 100

                prompt = f"""STATISTICAL OBSERVATION: Anomaly detected in Share of Search data.

Brand: {query}
Date: {date}
Observed Share: {value:.1f}%
Average Share: {avg_value:.1f}%
Deviation: {deviation_pct:+.1f}%
Z-score: {z_score:.2f}

DATA QUALITY NOTICE: Google Trends has ±5% measurement error (Cebrián & Domenech 2023).

Provide a 2-3 sentence statistical description:
1. Quantify the deviation in statistical terms
2. Note that causation cannot be determined from this data alone
3. List 2-3 possible explanations that would require independent verification

CONSTRAINTS:
- Use "correlated with" or "associated with" not "caused by"
- State "requires external validation" for any causal hypothesis
- Acknowledge measurement error could explain part or all of the deviation
- Do NOT make definitive causal claims"""

                response = self.client.chat.completions.create(
                    model="gpt-4",
                    messages=[
                        {
                            "role": "system",
                            "content": (
                                "You are a statistical analyst. You describe observations "
                                "without claiming causation. You ALWAYS acknowledge "
                                "measurement error and the need for external validation."
                            )
                        },
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.3,
                    max_tokens=200
                )

                explanation = response.choices[0].message.content.strip()

                explanations.append({
                    "query": query,
                    "date": date,
                    "value": f"{value:.1f}%",
                    "explanation": explanation
                })

            logger.info(f"Generated {len(explanations)} anomaly descriptions")
            return explanations

        except Exception as e:
            logger.warning(f"Anomaly description failed: {e}")
            return []
    def generate_competitive_insights(
        self,
        period_metrics: pd.DataFrame
    ) -> str:
        """
        Generate statistical competitive analysis.

        Args:
            period_metrics: Aggregate metrics per brand

        Returns:
            Statistical competitive analysis text
        """
        if not self.client:
            return ""

        try:
            # Prepare metrics summary
            metrics_text = period_metrics.to_string()

            prompt = f"""Analyze these Share of Search metrics with statistical rigor:

DATA QUALITY NOTICE:
- Google Trends measurement error: ±5% (Cebrián & Domenech 2023)
- Share values should be interpreted with this uncertainty
- Correlations do not imply causation

{metrics_text}

Provide statistical analysis in 4 sections:

1. RELATIVE POSITIONING
   - Report rank order with measurement uncertainty
   - State "Brand X observed at Y% ± 5%" not "Brand X dominates"
   - Use statistical language

2. VOLATILITY PATTERNS
   - Compare volatility metrics quantitatively
   - Higher volatility = less predictable search patterns
   - Acknowledge this is descriptive, not prescriptive

3. OBSERVED CORRELATIONS
   - Note any patterns in the data
   - Use "correlated with" or "associated with"
   - Do NOT claim these explain performance differences

4. DATA LIMITATIONS
   - Remind that this is search data only, not market performance
   - Note coverage biases (excludes non-Google users)
   - Strategic decisions require additional data sources

CONSTRAINTS:
- NO causal claims without experimental evidence
- NO strategic recommendations (requires external data)
- Use statistical language throughout
- Acknowledge uncertainty explicitly"""

            response = self.client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {
                        "role": "system",
                        "content": (
                            "You are a statistical analyst. You describe patterns in data "
                            "without making causal or strategic claims. You ALWAYS acknowledge "
                            "measurement error and data limitations."
                        )
                    },
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3,
                max_tokens=700
            )

            insights = response.choices[0].message.content.strip()
            logger.info("Generated competitive statistical analysis")
            return insights

        except Exception as e:
            logger.warning(f"Competitive analysis generation failed: {e}")
            return ""
    def generate_recommendations(
        self,
        df: pd.DataFrame,
        period_metrics: pd.DataFrame
    ) -> Dict[str, str]:
        """
        Generate statistical profiles per brand (NOT strategic recommendations).

        Args:
            df: Full time series
            period_metrics: Aggregate metrics

        Returns:
            Dictionary mapping brand to statistical profile
        """
        if not self.client:
            return {}

        try:
            profiles = {}

            for _, row in period_metrics.iterrows():
                brand = row['query']
                avg_share = row['avg_share']
                volatility = row.get('volatility', 0)

                # Get trend info
                brand_data = df[df['query'] == brand]
                trend = brand_data['trend_direction'].iloc[-1] if len(brand_data) > 0 else 'stable'

                prompt = f"""Generate a statistical profile for this brand (NOT recommendations):

Brand: {brand}
Average Share of Search: {avg_share:.1f}% (±5% measurement error)
Trend Direction: {trend}
Volatility: {volatility:.2f}

DATA QUALITY NOTICE: Google Trends has ±5% measurement error (Cebrián & Domenech 2023)

Provide a 3-point statistical profile:

1. RELATIVE POSITION
   - Where this brand sits relative to competitors in the dataset
   - Note measurement uncertainty

2. TEMPORAL PATTERN
   - Describe the trend statistically (direction, approximate slope)
   - State "observed correlation with time" not "performance improvement"

3. VARIABILITY CONTEXT
   - Interpret volatility value
   - Higher volatility = less predictable search patterns
   - This is descriptive only

CONSTRAINTS:
- NO strategic recommendations (requires domain expertise + external data)
- NO causal claims
- Statistical description only
- Acknowledge this is search data, not market performance
- End with: "Strategic decisions require additional data sources beyond Google Trends"
"""

                response = self.client.chat.completions.create(
                    model="gpt-4",
                    messages=[
                        {
                            "role": "system",
                            "content": (
                                "You are a statistical analyst. You provide statistical "
                                "descriptions ONLY, never strategic advice. You ALWAYS "
                                "acknowledge measurement error and data limitations."
                            )
                        },
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.3,
                    max_tokens=2000
                )

                profiles[brand] = response.choices[0].message.content.strip()

            logger.info(f"Generated statistical profiles for {len(profiles)} brands")
            return profiles

        except Exception as e:
            logger.warning(f"Statistical profile generation failed: {e}")
            return {}
    def _prepare_data_summary(
        self,
        df: pd.DataFrame,
        period_metrics: pd.DataFrame,
        market_concentration: Dict[str, Any]
    ) -> str:
        """Prepare data summary for AI analysis."""
        summary_parts = []

        # Period info
        start_date = df['date'].min().strftime('%Y-%m-%d')
        end_date = df['date'].max().strftime('%Y-%m-%d')
        summary_parts.append(f"Period: {start_date} to {end_date}")

        # Market concentration
        hhi = market_concentration.get('hhi', 0)
        concentration = market_concentration.get('concentration', 'unknown')
        summary_parts.append(f"\nMarket Concentration: HHI={hhi:.0f} ({concentration})")

        # Brand metrics
        summary_parts.append("\nBrand Performance:")
        for _, row in period_metrics.iterrows():
            brand = row['query']
            avg_share = row['avg_share']
            volatility = row.get('volatility', 0)
            summary_parts.append(f"- {brand}: {avg_share:.1f}% avg share, volatility={volatility:.2f}")

        # Trends
        summary_parts.append("\nTrends:")
        for query in df['query'].unique():
            query_data = df[df['query'] == query]
            if len(query_data) > 0:
                trend = query_data['trend_direction'].iloc[-1]
                slope = query_data['trend_slope'].iloc[-1]
                summary_parts.append(f"- {query}: {trend} (slope={slope:.3f})")

        return "\n".join(summary_parts)

    def _fallback_executive_summary(self, period_metrics: pd.DataFrame) -> str:
        """Generate fallback summary without AI."""
        # Find leader
        leader = period_metrics.loc[period_metrics['avg_share'].idxmax()]

        summary = f"""Share of Search Analysis Summary

Market Leader: {leader['query']} with {leader['avg_share']:.1f}% average share.

The analysis covers {len(period_metrics)} brands/queries over the specified period.
"""

        if len(period_metrics) > 1:
            second = period_metrics.nlargest(2, 'avg_share').iloc[1]
            summary += f"Second place: {second['query']} at {second['avg_share']:.1f}% share. "

        summary += "\n\nFor detailed AI-powered insights, please configure an OpenAI API key."

        return summary
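
# Usage sketch: a minimal, hypothetical example of driving AIInsightEngine.
# The DataFrame columns ('date', 'query', 'share_of_search', 'trend_direction',
# 'trend_slope', 'avg_share', 'volatility') mirror those the methods above
# read; all sample values below are invented for illustration only.
if __name__ == "__main__":
    sample_df = pd.DataFrame({
        "date": pd.to_datetime(["2024-01-01", "2024-01-08", "2024-01-01", "2024-01-08"]),
        "query": ["brand_a", "brand_a", "brand_b", "brand_b"],
        "share_of_search": [55.0, 57.0, 45.0, 43.0],
        "trend_direction": ["rising", "rising", "falling", "falling"],
        "trend_slope": [0.3, 0.3, -0.3, -0.3],
    })
    sample_metrics = pd.DataFrame({
        "query": ["brand_a", "brand_b"],
        "avg_share": [56.0, 44.0],
        "volatility": [1.2, 1.5],
    })
    sample_concentration = {"hhi": 5072, "concentration": "high"}

    # Without OPENAI_API_KEY set, client is None and this exercises the
    # non-AI fallback path (_fallback_executive_summary).
    engine = AIInsightEngine()
    print(engine.generate_executive_summary(sample_df, sample_metrics, sample_concentration))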