This capstone project brings together all concepts learned throughout the course in a comprehensive data analysis project.
You are a data analyst at "RetailHub," a growing e-commerce platform that sells consumer electronics. The company has been operating for 3 years and has collected substantial data on customer behavior, sales, and operations. Management wants to understand business performance and identify growth opportunities.
Business Objectives
Available Data Sources
Stakeholder Interviews
Key Business Questions
Project Scope
Success Metrics
-- Initial data assessment for capstone project
WITH data_inventory AS (
-- Assess available data sources and quality
SELECT
'Data Source Assessment' as assessment_type,
table_name,
record_count,
date_range_start,
date_range_end,
-- Data quality metrics
COUNT(CASE WHEN primary_key IS NULL THEN 1 END) as missing_primary_keys,
COUNT(CASE WHEN created_date IS NULL THEN 1 END) as missing_dates,
ROUND(COUNT(CASE WHEN created_date IS NULL THEN 1 END) * 100.0 / record_count, 2) as missing_date_pct,
-- Data completeness
CASE
WHEN record_count > 1000000 THEN 'Large Dataset'
WHEN record_count > 100000 THEN 'Medium Dataset'
ELSE 'Small Dataset'
END as dataset_size,
-- Data freshness
CASE
WHEN date_range_end >= CURRENT_DATE - INTERVAL '7 days' THEN 'Fresh'
WHEN date_range_end >= CURRENT_DATE - INTERVAL '30 days' THEN 'Recent'
ELSE 'Stale'
END as data_freshness
FROM (
SELECT
'customers' as table_name,
COUNT(*) as record_count,
MIN(created_date) as date_range_start,
MAX(created_date) as date_range_end,
customer_id as primary_key,
created_date
FROM customers
UNION ALL
SELECT
'orders' as table_name,
COUNT(*) as record_count,
MIN(order_date) as date_range_start,
MAX(order_date) as date_range_end,
order_id as primary_key,
order_date as created_date
FROM orders
UNION ALL
SELECT
'marketing_campaigns' as table_name,
COUNT(*) as record_count,
MIN(start_date) as date_range_start,
MAX(end_date) as date_range_end,
campaign_id as primary_key,
start_date as created_date
FROM marketing_campaigns
) data_sources
GROUP BY table_name, record_count, date_range_start, date_range_end
),
project_feasibility AS (
-- Assess project feasibility based on data availability
SELECT
'Project Feasibility' as assessment_type,
-- Data availability for key analyses
CASE
WHEN COUNT(CASE WHEN table_name IN ('customers', 'orders') THEN 1 END) = 2 THEN 'Customer Analysis: Feasible'
ELSE 'Customer Analysis: Limited Data'
END as customer_analysis_feasibility,
CASE
WHEN COUNT(CASE WHEN table_name = 'marketing_campaigns' THEN 1 END) = 1 THEN 'Marketing Analysis: Feasible'
ELSE 'Marketing Analysis: Limited Data'
END as marketing_analysis_feasibility,
-- Overall project risk
CASE
WHEN COUNT(CASE WHEN missing_date_pct > 5 THEN 1 END) > 0 THEN 'High Risk - Data Quality Issues'
WHEN COUNT(CASE WHEN data_freshness = 'Stale' THEN 1 END) > 0 THEN 'Medium Risk - Stale Data'
ELSE 'Low Risk - Good Data Quality'
END as overall_risk_level,
-- Recommended approach
CASE
WHEN COUNT(CASE WHEN missing_date_pct > 5 THEN 1 END) > 0 THEN 'Data Cleaning Required Before Analysis'
WHEN COUNT(CASE WHEN data_freshness = 'Stale' THEN 1 END) > 0 THEN 'Consider Data Update Strategy'
ELSE 'Proceed with Standard Analysis Approach'
END as recommended_approach
FROM data_inventory
)
SELECT
assessment_type,
table_name,
record_count,
date_range_start,
date_range_end,
missing_date_pct,
dataset_size,
data_freshness,
customer_analysis_feasibility,
marketing_analysis_feasibility,
overall_risk_level,
recommended_approach
FROM data_inventory
UNION ALL
SELECT
assessment_type,
NULL as table_name,
NULL as record_count,
NULL as date_range_start,
NULL as date_range_end,
NULL as missing_date_pct,
NULL as dataset_size,
NULL as data_freshness,
customer_analysis_feasibility,
marketing_analysis_feasibility,
overall_risk_level,
recommended_approach
FROM project_feasibility
ORDER BY assessment_type, table_name;
Data Collection Strategy
Data Integration Challenges
Data Cleaning Tasks
Quality Assurance Checks
-- Data cleaning and preparation for analysis
WITH customer_data_cleaning AS (
-- Clean and standardize customer data
SELECT
customer_id,
-- Standardize name formats
UPPER(TRIM(first_name)) as first_name_clean,
UPPER(TRIM(last_name)) as last_name_clean,
-- Clean and validate email
LOWER(TRIM(email)) as email_clean,
CASE
WHEN email LIKE '%_@__%.__%' THEN 'Valid'
ELSE 'Invalid'
END as email_status,
-- Standardize phone numbers
REGEXP_REPLACE(phone, '[^0-9]', '') as phone_clean,
CASE
WHEN LENGTH(REGEXP_REPLACE(phone, '[^0-9]', '')) = 10 THEN 'Valid'
ELSE 'Invalid'
END as phone_status,
-- Handle missing dates
COALESCE(created_date, registration_date, '1900-01-01') as customer_since_date,
-- Age calculation
CASE
WHEN date_of_birth IS NOT NULL
THEN DATE_PART('year', AGE(CURRENT_DATE, date_of_birth))
ELSE NULL
END as age,
-- Data quality flags
CASE
WHEN email NOT LIKE '%_@__%.__%' OR phone IS NULL THEN 'Review Required'
WHEN created_date IS NULL THEN 'Date Issue'
ELSE 'Clean'
END as data_quality_status
FROM customers_raw
),
order_data_cleaning AS (
-- Clean and enrich order data
SELECT
order_id,
customer_id,
-- Standardize dates
COALESCE(order_date, created_date) as order_date_clean,
-- Validate monetary values
CASE
WHEN total_amount > 0 THEN total_amount
ELSE NULL
END as order_amount_clean,
-- Calculate order metrics
COUNT(DISTINCT order_item_id) as item_count,
SUM(quantity) as total_quantity,
AVG(unit_price) as avg_item_price,
-- Order status classification
CASE
WHEN status IN ('delivered', 'completed') THEN 'Completed'
WHEN status IN ('cancelled', 'refunded') THEN 'Cancelled'
WHEN status IN ('processing', 'pending') THEN 'In Progress'
ELSE 'Unknown'
END as order_status_clean,
-- Data quality indicators
CASE
WHEN total_amount <= 0 THEN 'Invalid Amount'
WHEN customer_id IS NULL THEN 'Missing Customer'
WHEN order_date IS NULL THEN 'Missing Date'
ELSE 'Clean'
END as data_quality_flag
FROM orders_raw
GROUP BY order_id, customer_id, total_amount, status, order_date, created_date
),
data_quality_summary AS (
-- Summarize data quality results
SELECT
'Data Quality Summary' as summary_type,
'Customers' as data_source,
COUNT(*) as total_records,
COUNT(CASE WHEN data_quality_status = 'Clean' THEN 1 END) as clean_records,
COUNT(CASE WHEN data_quality_status != 'Clean' THEN 1 END) as needs_review,
ROUND(COUNT(CASE WHEN data_quality_status = 'Clean' THEN 1 END) * 100.0 / COUNT(*), 2) as data_quality_pct,
-- Specific issues
COUNT(CASE WHEN email_status = 'Invalid' THEN 1 END) as invalid_emails,
COUNT(CASE WHEN phone_status = 'Invalid' THEN 1 END) as invalid_phones,
COUNT(CASE WHEN created_date IS NULL THEN 1 END) as missing_dates
FROM customer_data_cleaning
UNION ALL
SELECT
'Data Quality Summary' as summary_type,
'Orders' as data_source,
COUNT(*) as total_records,
COUNT(CASE WHEN data_quality_flag = 'Clean' THEN 1 END) as clean_records,
COUNT(CASE WHEN data_quality_flag != 'Clean' THEN 1 END) as needs_review,
ROUND(COUNT(CASE WHEN data_quality_flag = 'Clean' THEN 1 END) * 100.0 / COUNT(*), 2) as data_quality_pct,
-- Specific issues
COUNT(CASE WHEN data_quality_flag = 'Invalid Amount' THEN 1 END) as invalid_amounts,
COUNT(CASE WHEN data_quality_flag = 'Missing Customer' THEN 1 END) as missing_customers,
COUNT(CASE WHEN data_quality_flag = 'Missing Date' THEN 1 END) as missing_dates
FROM order_data_cleaning
)
SELECT
summary_type,
data_source,
total_records,
clean_records,
needs_review,
data_quality_pct,
invalid_emails as email_issues,
invalid_phones as phone_issues,
missing_dates,
-- Quality assessment
CASE
WHEN data_quality_pct >= 95 THEN 'Excellent'
WHEN data_quality_pct >= 90 THEN 'Good'
WHEN data_quality_pct >= 80 THEN 'Acceptable'
ELSE 'Poor - Requires Cleaning'
END as quality_rating,
-- Recommended actions
CASE
WHEN data_quality_pct >= 95 THEN 'Proceed with Analysis'
WHEN data_quality_pct >= 90 THEN 'Minor Cleaning Required'
WHEN data_quality_pct >= 80 THEN 'Significant Cleaning Required'
ELSE 'Major Data Quality Issues - Address First'
END as recommended_action
FROM data_quality_summary
ORDER BY data_source;
RFM Analysis
Behavioral Segmentation
Channel Performance
Campaign Analysis
-- Customer segmentation using RFM analysis
WITH customer_rfm AS (
-- Calculate RFM metrics for each customer
SELECT
customer_id,
-- Recency: Days since last purchase
DATEDIFF(CURRENT_DATE, MAX(order_date)) as recency_days,
-- Frequency: Number of orders in last 12 months
COUNT(DISTINCT CASE WHEN order_date >= CURRENT_DATE - INTERVAL '12 months' THEN order_id END) as frequency_12m,
-- Monetary: Total spend in last 12 months
SUM(CASE WHEN order_date >= CURRENT_DATE - INTERVAL '12 months' THEN total_amount END) as monetary_12m,
-- Additional metrics
AVG(total_amount) as avg_order_value,
MIN(order_date) as first_purchase_date,
MAX(order_date) as last_purchase_date,
COUNT(DISTINCT order_id) as total_orders,
SUM(total_amount) as total_lifetime_value
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
WHERE order_status = 'Completed'
GROUP BY customer_id
),
rfm_scores AS (
-- Assign RFM scores (1-5, where 5 is best)
SELECT
customer_id,
recency_days,
frequency_12m,
monetary_12m,
-- Recency score (lower recency = higher score)
CASE
WHEN recency_days <= 30 THEN 5
WHEN recency_days <= 60 THEN 4
WHEN recency_days <= 90 THEN 3
WHEN recency_days <= 180 THEN 2
ELSE 1
END as recency_score,
-- Frequency score (higher frequency = higher score)
CASE
WHEN frequency_12m >= 10 THEN 5
WHEN frequency_12m >= 7 THEN 4
WHEN frequency_12m >= 4 THEN 3
WHEN frequency_12m >= 2 THEN 2
ELSE 1
END as frequency_score,
-- Monetary score (higher monetary = higher score)
CASE
WHEN monetary_12m >= 1000 THEN 5
WHEN monetary_12m >= 500 THEN 4
WHEN monetary_12m >= 250 THEN 3
WHEN monetary_12m >= 100 THEN 2
ELSE 1
END as monetary_score,
avg_order_value,
total_lifetime_value
FROM customer_rfm
),
customer_segments AS (
-- Define customer segments based on RFM scores
SELECT
customer_id,
recency_days,
frequency_12m,
monetary_12m,
recency_score,
frequency_score,
monetary_score,
-- Overall RFM score
recency_score + frequency_score + monetary_score as rfm_total_score,
-- Segment classification
CASE
WHEN recency_score >= 4 AND frequency_score >= 4 AND monetary_score >= 4 THEN 'Champions'
WHEN recency_score >= 3 AND frequency_score >= 3 AND monetary_score >= 3 THEN 'Loyal Customers'
WHEN recency_score >= 4 AND frequency_score <= 2 THEN 'New Customers'
WHEN recency_score <= 2 AND frequency_score >= 3 THEN 'At Risk'
WHEN recency_score <= 2 AND frequency_score <= 2 AND monetary_score >= 3 THEN 'Cannot Lose Them'
WHEN recency_score <= 2 AND frequency_score <= 2 AND monetary_score <= 2 THEN 'Lost'
ELSE 'Others'
END as customer_segment,
-- Segment characteristics
CASE
WHEN recency_score >= 4 AND frequency_score >= 4 AND monetary_score >= 4 THEN 'Best customers - recent, frequent, high spenders'
WHEN recency_score >= 3 AND frequency_score >= 3 AND monetary_score >= 3 THEN 'Good customers - regular purchasers'
WHEN recency_score >= 4 AND frequency_score <= 2 THEN 'Recently acquired - need nurturing'
WHEN recency_score <= 2 AND frequency_score >= 3 THEN 'Previously good customers - declining'
WHEN recency_score <= 2 AND frequency_score <= 2 AND monetary_score >= 3 THEN 'High value but inactive - reengage urgently'
WHEN recency_score <= 2 AND frequency_score <= 2 AND monetary_score <= 2 THEN 'Inactive customers - low priority'
ELSE 'Mixed behavior - further analysis needed'
END as segment_description,
avg_order_value,
total_lifetime_value
FROM rfm_scores
),
segment_summary AS (
-- Summarize segment characteristics
SELECT
customer_segment,
segment_description,
COUNT(*) as customer_count,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage_of_customers,
ROUND(AVG(recency_days), 1) as avg_recency_days,
ROUND(AVG(frequency_12m), 1) as avg_frequency_12m,
ROUND(AVG(monetary_12m), 2) as avg_monetary_12m,
ROUND(AVG(avg_order_value), 2) as avg_order_value,
ROUND(AVG(total_lifetime_value), 2) as avg_lifetime_value,
ROUND(SUM(total_lifetime_value), 2) as total_segment_value,
ROUND(SUM(total_lifetime_value) * 100.0 / SUM(SUM(total_lifetime_value)) OVER (), 2) as value_percentage
FROM customer_segments
GROUP BY customer_segment, segment_description
ORDER BY avg_lifetime_value DESC
)
SELECT
'Customer Segmentation Analysis' as analysis_type,
customer_segment,
segment_description,
customer_count,
percentage_of_customers,
avg_recency_days,
avg_frequency_12m,
avg_monetary_12m,
avg_order_value,
avg_lifetime_value,
total_segment_value,
value_percentage,
-- Strategic recommendations
CASE
WHEN customer_segment = 'Champions' THEN 'Exclusive offers, early access, loyalty program'
WHEN customer_segment = 'Loyal Customers' THEN 'Cross-sell, upsell, referral programs'
WHEN customer_segment = 'New Customers' THEN 'Onboarding, education, first purchase incentives'
WHEN customer_segment = 'At Risk' THEN 'Re-engagement campaigns, special offers'
WHEN customer_segment = 'Cannot Lose Them' THEN 'Personal outreach, win-back campaigns'
WHEN customer_segment = 'Lost' THEN 'Survey, reactivation offers, or deprioritize'
ELSE 'Further analysis needed for targeted strategy'
END as recommended_strategy,
-- Priority level
CASE
WHEN customer_segment IN ('Champions', 'Cannot Lose Them') THEN 'High Priority'
WHEN customer_segment IN ('Loyal Customers', 'At Risk') THEN 'Medium Priority'
ELSE 'Low Priority'
END as priority_level
FROM segment_summary
ORDER BY total_segment_value DESC;
Customer Lifetime Value Prediction
Churn Prediction
Inventory Optimization
Process Improvement
-- Advanced analytics for business insights
WITH customer_lifetime_value AS (
-- Calculate and predict customer lifetime value
SELECT
customer_id,
-- Historical metrics
COUNT(DISTINCT order_id) as total_orders,
SUM(total_amount) as total_revenue,
AVG(total_amount) as avg_order_value,
DATEDIFF(CURRENT_DATE, MIN(order_date)) as customer_age_days,
DATEDIFF(CURRENT_DATE, MAX(order_date)) as days_since_last_purchase,
-- Behavioral metrics
COUNT(DISTINCT DATE_TRUNC('month', order_date)) as active_months,
-- Predictive metrics
AVG(total_amount) * 12 as predicted_annual_value,
CASE
WHEN days_since_last_purchase <= 90 THEN 'Active'
WHEN days_since_last_purchase <= 180 THEN 'At Risk'
WHEN days_since_last_purchase <= 365 THEN 'Dormant'
ELSE 'Lost'
END as customer_status,
-- CLV calculation (simplified)
AVG(total_amount) * COUNT(DISTINCT order_id) *
CASE
WHEN days_since_last_purchase <= 90 THEN 2.5
WHEN days_since_last_purchase <= 180 THEN 1.8
WHEN days_since_last_purchase <= 365 THEN 1.2
ELSE 0.5
END as predicted_clv
FROM orders
WHERE order_status = 'Completed'
GROUP BY customer_id
),
marketing_attribution AS (
-- Analyze marketing channel effectiveness
SELECT
m.campaign_id,
m.channel,
m.campaign_name,
m.total_spend as marketing_cost,
-- Performance metrics
COUNT(DISTINCT o.customer_id) as acquired_customers,
COUNT(DISTINCT o.order_id) as total_orders,
SUM(o.total_amount) as attributed_revenue,
-- Efficiency metrics
ROUND(SUM(o.total_amount) / m.total_spend, 2) as roas,
ROUND(m.total_spend / COUNT(DISTINCT o.customer_id), 2) as cac,
ROUND(SUM(o.total_amount) / COUNT(DISTINCT o.customer_id), 2) as revenue_per_customer,
-- Quality metrics
AVG(clv.predicted_clv) as avg_predicted_clv,
COUNT(DISTINCT CASE WHEN clv.customer_status = 'Active' THEN o.customer_id END) as active_customers,
-- Performance classification
CASE
WHEN SUM(o.total_amount) / m.total_spend >= 4.0 THEN 'Excellent'
WHEN SUM(o.total_amount) / m.total_spend >= 3.0 THEN 'Good'
WHEN SUM(o.total_amount) / m.total_spend >= 2.0 THEN 'Acceptable'
ELSE 'Poor'
END as performance_rating
FROM marketing_campaigns m
LEFT JOIN orders o ON m.campaign_id = o.attributed_campaign_id
LEFT JOIN customer_lifetime_value clv ON o.customer_id = clv.customer_id
WHERE m.start_date >= CURRENT_DATE - INTERVAL '12 months'
GROUP BY m.campaign_id, m.channel, m.campaign_name, m.total_spend
),
operational_efficiency AS (
-- Analyze operational efficiency metrics
SELECT
'Operational Analysis' as analysis_type,
-- Order processing metrics
COUNT(DISTINCT order_id) as total_orders_processed,
AVG(DATEDIFF(shipped_date, order_date)) as avg_processing_days,
AVG(DATEDIFF(delivered_date, order_date)) as avg_delivery_days,
COUNT(CASE WHEN shipped_date > promised_date THEN 1 END) * 100.0 / COUNT(*) as late_shipment_pct,
-- Inventory metrics
COUNT(DISTINCT product_id) as unique_products_sold,
AVG(stock_turnover_days) as avg_inventory_turnover,
COUNT(CASE WHEN stock_level < reorder_point THEN 1 END) as stockout_count,
-- Customer service metrics
AVG(customer_satisfaction_score) as avg_satisfaction,
COUNT(DISTINCT support_ticket_id) as total_support_tickets,
AVG(ticket_resolution_hours) as avg_resolution_time,
-- Efficiency scores
CASE
WHEN AVG(DATEDIFF(delivered_date, order_date)) <= 2 THEN 'Excellent'
WHEN AVG(DATEDIFF(delivered_date, order_date)) <= 3 THEN 'Good'
WHEN AVG(DATEDIFF(delivered_date, order_date)) <= 5 THEN 'Acceptable'
ELSE 'Poor'
END as delivery_efficiency,
CASE
WHEN AVG(customer_satisfaction_score) >= 4.5 THEN 'Excellent'
WHEN AVG(customer_satisfaction_score) >= 4.0 THEN 'Good'
WHEN AVG(customer_satisfaction_score) >= 3.5 THEN 'Acceptable'
ELSE 'Poor'
END as service_efficiency
FROM operational_metrics
WHERE metric_date >= CURRENT_DATE - INTERVAL '30 days'
)
SELECT
'Advanced Analytics Insights' as report_section,
-- Customer insights
(SELECT COUNT(*) FROM customer_lifetime_value WHERE predicted_clv > 1000) as high_value_customers,
(SELECT AVG(predicted_clv) FROM customer_lifetime_value) as avg_predicted_clv,
(SELECT COUNT(*) FROM customer_lifetime_value WHERE customer_status = 'At Risk') as at_risk_customers,
-- Marketing insights
(SELECT AVG(roas) FROM marketing_attribution) as avg_roas,
(SELECT AVG(cac) FROM marketing_attribution) as avg_cac,
(SELECT COUNT(*) FROM marketing_attribution WHERE performance_rating = 'Excellent') as excellent_campaigns,
-- Operational insights
(SELECT avg_processing_days FROM operational_efficiency) as avg_processing_days,
(SELECT avg_satisfaction FROM operational_efficiency) as avg_satisfaction,
(SELECT late_shipment_pct FROM operational_efficiency) as late_shipment_pct,
-- Strategic recommendations
CASE
WHEN (SELECT AVG(roas) FROM marketing_attribution) >= 3.0 THEN 'Scale successful marketing channels'
WHEN (SELECT AVG(roas) FROM marketing_attribution) >= 2.0 THEN 'Optimize marketing mix'
ELSE 'Review marketing strategy'
END as marketing_recommendation,
CASE
WHEN (SELECT COUNT(*) FROM customer_lifetime_value WHERE customer_status = 'At Risk') >
(SELECT COUNT(*) FROM customer_lifetime_value) * 0.2 THEN 'Urgent retention needed'
WHEN (SELECT COUNT(*) FROM customer_lifetime_value WHERE customer_status = 'At Risk') >
(SELECT COUNT(*) FROM customer_lifetime_value) * 0.1 THEN 'Implement retention programs'
ELSE 'Monitor customer health'
END as customer_recommendation,
CASE
WHEN (SELECT late_shipment_pct FROM operational_efficiency) > 10 THEN 'Address operational bottlenecks'
WHEN (SELECT late_shipment_pct FROM operational_efficiency) > 5 THEN 'Optimize fulfillment process'
ELSE 'Maintain operational efficiency'
END as operational_recommendation;
Key Performance Indicators
Visual Design Elements
Strategic Initiatives
Implementation Roadmap
-- Executive dashboard data preparation
WITH executive_kpis AS (
-- Calculate key performance indicators for executive dashboard
SELECT
'Executive Dashboard' as dashboard_type,
CURRENT_DATE as report_date,
-- Revenue KPIs
SUM(CASE WHEN order_date >= CURRENT_DATE - INTERVAL '30 days' THEN total_amount END) as revenue_30d,
SUM(CASE WHEN order_date >= CURRENT_DATE - INTERVAL '30 days' THEN total_amount END) -
SUM(CASE WHEN order_date >= CURRENT_DATE - INTERVAL '60 days' AND order_date < CURRENT_DATE - INTERVAL '30 days' THEN total_amount END) as revenue_change_30d,
ROUND((SUM(CASE WHEN order_date >= CURRENT_DATE - INTERVAL '30 days' THEN total_amount END) /
SUM(CASE WHEN order_date >= CURRENT_DATE - INTERVAL '60 days' AND order_date < CURRENT_DATE - INTERVAL '30 days' THEN total_amount END) - 1) * 100, 2) as revenue_growth_pct,
-- Customer KPIs
COUNT(DISTINCT CASE WHEN first_order_date >= CURRENT_DATE - INTERVAL '30 days' THEN customer_id END) as new_customers_30d,
COUNT(DISTINCT CASE WHEN last_order_date >= CURRENT_DATE - INTERVAL '30 days' THEN customer_id END) as active_customers_30d,
ROUND(COUNT(DISTINCT CASE WHEN last_order_date >= CURRENT_DATE - INTERVAL '30 days' THEN customer_id END) * 100.0 /
COUNT(DISTINCT CASE WHEN last_order_date >= CURRENT_DATE - INTERVAL '90 days' THEN customer_id END), 2) as retention_rate_30d,
-- Marketing KPIs
SUM(CASE WHEN campaign_date >= CURRENT_DATE - INTERVAL '30 days' THEN campaign_spend END) as marketing_spend_30d,
SUM(CASE WHEN campaign_date >= CURRENT_DATE - INTERVAL '30 days' THEN attributed_revenue END) as attributed_revenue_30d,
ROUND(SUM(CASE WHEN campaign_date >= CURRENT_DATE - INTERVAL '30 days' THEN attributed_revenue END) /
NULLIF(SUM(CASE WHEN campaign_date >= CURRENT_DATE - INTERVAL '30 days' THEN campaign_spend END), 0), 2) as roas_30d,
-- Operational KPIs
AVG(CASE WHEN order_date >= CURRENT_DATE - INTERVAL '30 days' THEN processing_hours END) as avg_processing_hours_30d,
COUNT(CASE WHEN order_date >= CURRENT_DATE - INTERVAL '30 days' AND late_delivery = 1 THEN 1 END) * 100.0 /
COUNT(CASE WHEN order_date >= CURRENT_DATE - INTERVAL '30 days' THEN 1 END) as on_time_delivery_pct_30d
FROM (
SELECT
o.order_id,
o.customer_id,
o.order_date,
o.total_amount,
o.processing_hours,
o.late_delivery,
c.first_order_date,
c.last_order_date,
m.campaign_date,
m.campaign_spend,
m.attributed_revenue
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
LEFT JOIN marketing_attribution m ON o.order_id = m.order_id
WHERE o.order_date >= CURRENT_DATE - INTERVAL '90 days'
) combined_data
),
performance_trends AS (
-- Calculate performance trends for trend analysis
SELECT
DATE_TRUNC('week', order_date) as week_start,
SUM(total_amount) as weekly_revenue,
COUNT(DISTINCT customer_id) as weekly_customers,
AVG(total_amount) as avg_order_value,
COUNT(DISTINCT order_id) as weekly_orders,
-- Week-over-week growth
LAG(SUM(total_amount)) OVER (ORDER BY DATE_TRUNC('week', order_date)) as prev_week_revenue,
ROUND((SUM(total_amount) - LAG(SUM(total_amount)) OVER (ORDER BY DATE_TRUNC('week', order_date))) /
LAG(SUM(total_amount)) OVER (ORDER BY DATE_TRUNC('week', order_date)) * 100, 2) as wow_revenue_growth
FROM orders
WHERE order_date >= CURRENT_DATE - INTERVAL '12 weeks'
GROUP BY DATE_TRUNC('week', order_date)
),
alert_system AS (
-- Generate alerts for key metrics
SELECT
'Alert System' as alert_type,
-- Revenue alerts
CASE
WHEN (SELECT revenue_growth_pct FROM executive_kpis) < -10 THEN 'Critical: Revenue declining rapidly'
WHEN (SELECT revenue_growth_pct FROM executive_kpis) < -5 THEN 'Warning: Revenue decline detected'
WHEN (SELECT revenue_growth_pct FROM executive_kpis) > 15 THEN 'Info: Strong revenue growth'
ELSE NULL
END as revenue_alert,
-- Customer alerts
CASE
WHEN (SELECT retention_rate_30d FROM executive_kpis) < 70 THEN 'Critical: Low customer retention'
WHEN (SELECT retention_rate_30d FROM executive_kpis) < 80 THEN 'Warning: Retention below target'
ELSE NULL
END as customer_alert,
-- Marketing alerts
CASE
WHEN (SELECT roas_30d FROM executive_kpis) < 2.0 THEN 'Critical: Poor marketing ROAS'
WHEN (SELECT roas_30d FROM executive_kpis) < 3.0 THEN 'Warning: ROAS below target'
ELSE NULL
END as marketing_alert,
-- Operational alerts
CASE
WHEN (SELECT on_time_delivery_pct_30d FROM executive_kpis) < 85 THEN 'Critical: Poor delivery performance'
WHEN (SELECT on_time_delivery_pct_30d FROM executive_kpis) < 95 THEN 'Warning: Delivery below target'
ELSE NULL
END as operational_alert
FROM dual
)
SELECT
'Executive Dashboard Data' as data_type,
report_date,
revenue_30d,
revenue_growth_pct,
new_customers_30d,
active_customers_30d,
retention_rate_30d,
marketing_spend_30d,
roas_30d,
avg_processing_hours_30d,
on_time_delivery_pct_30d,
-- Performance indicators
CASE
WHEN revenue_growth_pct > 10 THEN 'Strong Growth'
WHEN revenue_growth_pct > 0 THEN 'Growth'
WHEN revenue_growth_pct > -5 THEN 'Stable'
ELSE 'Declining'
END as revenue_trend,
CASE
WHEN retention_rate_30d >= 85 THEN 'Excellent'
WHEN retention_rate_30d >= 80 THEN 'Good'
WHEN retention_rate_30d >= 75 THEN 'Acceptable'
ELSE 'Poor'
END as retention_performance,
CASE
WHEN roas_30d >= 4.0 THEN 'Excellent'
WHEN roas_30d >= 3.0 THEN 'Good'
WHEN roas_30d >= 2.0 THEN 'Acceptable'
ELSE 'Poor'
END as marketing_performance,
-- Priority actions
CASE
WHEN revenue_growth_pct < -5 OR retention_rate_30d < 75 OR roas_30d < 2.0 THEN 'Immediate Action Required'
WHEN revenue_growth_pct < 0 OR retention_rate_30d < 80 OR roas_30d < 3.0 THEN 'Review Required'
ELSE 'Monitor Performance'
END as action_priority,
-- Key alerts
(SELECT revenue_alert FROM alert_system WHERE revenue_alert IS NOT NULL) as revenue_alert,
(SELECT customer_alert FROM alert_system WHERE customer_alert IS NOT NULL) as customer_alert,
(SELECT marketing_alert FROM alert_system WHERE marketing_alert IS NOT NULL) as marketing_alert,
(SELECT operational_alert FROM alert_system WHERE operational_alert IS NOT NULL) as operational_alert
FROM executive_kpis;
Presentation Structure
Executive Summary (2 slides)
Current State Analysis (3 slides)
Strategic Recommendations (4 slides)
Implementation Plan (2 slides)
Technical Documentation
Business Documentation
You have completed the Data Analyst Fundamentals course! This capstone project demonstrates your ability to apply all the concepts learned throughout the course to solve real business problems. You now have the skills and portfolio pieces needed to pursue a career in data analytics.
The journey in data analytics is ongoing, and you now have the foundation to grow into a senior data analyst, business intelligence specialist, or data scientist role.