In [ ]:
from google.colab import drive
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns



# Environment Set up
drive.mount('/content/drive')
%cd /content/drive/MyDrive/dataset_final_project/


# ============================================================
# Function: Load CSV file with safety checks
# ============================================================

def load_csv(filepath, name="Dataset"):
    """Load a CSV file with error handling"""
    try:
        df = pd.read_csv(filepath)
        print(f"[INFO] {name} loaded successfully with {df.shape[0]} rows and {df.shape[1]} columns.")
        return df
    except FileNotFoundError:
        print(f"[ERROR] File not found: {filepath}")
        return pd.DataFrame()
    except Exception as e:
        print(f"[ERROR] Failed to load {name}: {e}")
        return pd.DataFrame()


# ============================================================
# Function: Clean Events dataset
# ============================================================

def clean_events(df_events, df_countries):
    """Clean events dataset: handle missing values and normalize text fields"""

    # Count missing values before cleaning
    missing_country = df_events["Country Code"].isna().sum()
    print(f"[INFO] Missing Country Code before cleaning: {missing_country} rows ({missing_country/len(df_events):.2%})")

    # Handle missing Country Code
    df_events["Country Code"] = df_events["Country Code"].fillna("UNK")
    if missing_country > 0:
        df_countries = pd.concat([
            df_countries,
            pd.DataFrame({
                "alpha-3": ["UNK"],
                "name": ["Unknown"],
                "region": ["Unknown"],
                "sub-region": ["Unknown"]
            })
        ], ignore_index=True)
        print(f"[INFO] Added 'UNK' as placeholder for {missing_country} missing rows.")

    # Drop rows with missing Units Sold
    missing_units = df_events["Units Sold"].isna().sum()
    df_events = df_events.dropna(subset=["Units Sold"])
    print(f"[INFO] Dropped {missing_units} rows with missing Units Sold.")

    # Normalize Sales Channel and Order Priority
    df_events.loc[:, "Sales Channel"] = df_events["Sales Channel"].str.strip().str.capitalize()
    df_events.loc[:, "Order Priority"] = df_events["Order Priority"].str.strip()


    print(f"[INFO] Dataset cleaned. Final shape: {df_events.shape}")
    return df_events, df_countries

# ============================================================
# Function: Merge & Transform Data
# ============================================================
def merge_and_transform(df_events, df_countries, df_products):
    """Merge 3 datasets and perform feature engineering for analysis."""

    # --- Merge 3 DataFrames into 1
    df_merged = (
        df_events
        .merge(df_countries, left_on="Country Code", right_on="alpha-3")
        .merge(df_products, left_on="Product ID", right_on="id")
    )
    print(f"[INFO] Shape after merging: {df_merged.shape}")

    # --- Rename columns for readability
    df_merged = df_merged.rename(columns={
        'item_type': 'Product Name',
        'name': 'Country Name',
        'region': 'Country Region',
        'sub-region': 'Country Sub-Region'
    })

    # --- Drop repetitive/unnecessary columns
    drop_cols = ['alpha-2', 'alpha-3', 'id']
    df_merged = df_merged.drop(columns=drop_cols, errors="ignore")
    print(f"[INFO] Dropped columns: {drop_cols}")

    # --- Update date columns to datetime
    df_merged["Order Date"] = pd.to_datetime(df_merged["Order Date"], errors="coerce")
    df_merged["Ship Date"] = pd.to_datetime(df_merged["Ship Date"], errors="coerce")

    # --- Add Year, Quarter, Fulfillment Days
    df_merged["Year"] = df_merged["Order Date"].dt.year
    df_merged["Quarter"] = df_merged["Order Date"].dt.to_period('Q')
    df_merged["Fulfillment Days"] = (
        df_merged["Ship Date"] - df_merged["Order Date"]
    ).dt.days

    # --- Calculate Profit and Revenue
    df_merged["Profit"] = df_merged["Unit Price"] - df_merged["Unit Cost"]
    df_merged["Profit Margin"] = (df_merged["Unit Price"] - df_merged["Unit Cost"]) / df_merged["Unit Price"] * 100
    df_merged["Total Profit"] = df_merged["Profit"] * df_merged["Units Sold"]
    df_merged["Total Revenue"] = df_merged["Unit Price"] * df_merged["Units Sold"]

    print("[INFO] Transformation complete. Final shape:", df_merged.shape)
    return df_merged


# ============================================================
# Data Visualization Functions
# ============================================================

def plot_profit_by_region(df):
    """Bar chart: Total Profit per Region."""
    plt.figure(figsize=(8, 6))
    ax = plt.gca()
    ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f'{x/1e6:.1f}M'))

    (
        df.groupby("Country Region")["Total Profit"]
        .sum()
        .sort_values(ascending=False)
        .plot(kind="bar")
    )

    plt.title("Total Profit per Region")
    plt.xticks(rotation=45)
    plt.show()
    print("[INFO] Displayed bar chart for Total Profit by Region.")


def plot_fulfillment_vs_margin(df):
    """Scatter + regression: Fulfillment Days vs Profit Margin (Europe vs Asia)."""
    df_region = df[df["Country Region"].isin(["Europe", "Asia"])]

    plt.figure(figsize=(10, 6))
    sns.scatterplot(
        data=df_region,
        x="Fulfillment Days",
        y="Profit Margin",
        hue="Country Region",
        alpha=0.7
    )

    # Trendlines per region
    for region, color in [("Europe", "blue"), ("Asia", "orange")]:
        sns.regplot(
            data=df_region[df_region["Country Region"] == region],
            x="Fulfillment Days",
            y="Profit Margin",
            scatter=False,
            color=color,
            label=f"{region} Trend"
        )

    plt.title("Fulfillment Days vs Profit Margin (Europe vs Asia)")
    plt.xlabel("Fulfillment Days")
    plt.ylabel("Profit Margin (%)")
    plt.legend()
    plt.grid(True)
    plt.show()
    print("[INFO] Displayed scatter + regression for Fulfillment Days vs Profit Margin.")


def plot_profit_trend_by_priority(df):
    """Line chart: Profit trend per Order Priority across years."""
    order_priority = df["Order Priority"].unique()

    priority_trend = (
        df.groupby(["Order Priority", "Year"])["Total Profit"]
        .sum()
        .reset_index()
    )

    priority_mapping = {
        "M": "Medium",
        "C": "Critical",
        "L": "Low",
        "H": "High"
    }

    plt.figure(figsize=(12, 6))
    ax = plt.gca()
    ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f'{x/1e6:.1f}M'))

    for priority in order_priority:
        priority_data = priority_trend[priority_trend["Order Priority"] == priority]
        plt.plot(
            priority_data["Year"],
            priority_data["Total Profit"],
            label=priority_mapping.get(priority, priority)
        )

    plt.xlabel("Year")
    plt.ylabel("Profit")
    plt.title("Profit Trend by Order Priority")
    plt.legend()
    plt.grid(True)
    plt.show()
    print("[INFO] Displayed line chart for Profit Trend by Order Priority.")


def plot_avg_profit_per_quarter(df):
    """Bar chart: Average Profit per Quarter (2010–2017)."""
    quarterly_profit = df.groupby('Quarter')['Total Profit'].sum().reset_index()
    quarterly_profit['Quarter'] = quarterly_profit['Quarter'].dt.to_timestamp()
    quarterly_profit['Quarter_Num'] = quarterly_profit['Quarter'].dt.quarter

    avg_quarter = quarterly_profit.groupby('Quarter_Num')['Total Profit'].mean().reset_index()

    plt.figure(figsize=(8, 5))
    plt.bar(
        avg_quarter['Quarter_Num'],
        avg_quarter['Total Profit'],
        color=['blue','green','orange','red']
    )
    plt.title("Average Profit per Quarter (2010–2017)")
    plt.xlabel("Quarter")
    plt.ylabel("Average Profit")
    plt.xticks([1,2,3,4], ["Q1","Q2","Q3","Q4"])
    plt.show()
    print("[INFO] Displayed bar chart for Average Profit per Quarter.")


def plot_europe_subregion_distribution(df):
    """Pie chart: Order distribution across Europe Sub-Regions."""
    df_europe = df[df["Country Region"] == "Europe"]

    europe_sub_region = (
        df_europe.groupby("Country Sub-Region")["Order ID"]
        .count()
        .sort_values(ascending=False)
    )

    plt.figure(figsize=(8, 6))
    europe_sub_region.plot(kind="pie", autopct="%1.1f%%", startangle=140)
    plt.title("Order Distribution of Europe Sub-Region")
    plt.ylabel("")
    plt.show()
    print("[INFO] Displayed pie chart for Order Distribution in Europe Sub-Region.")

def plot_sales_channel_revenue_trend(df):
  """Line chart: Revenue Trend by Sales Channel"""
  sales_channels = df["Sales Channel"].unique()

  sales_trend = (
      df.groupby(["Sales Channel", "Year"])["Total Revenue"].sum().reset_index()
  )

  plt.figure(figsize=(12, 6))
  ax = plt.gca()
  ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f'{x/1e6:.1f}M'))

  for channel in sales_channels:
      channel_data = sales_trend[sales_trend["Sales Channel"] == channel]
      plt.plot(channel_data["Year"], channel_data["Total Revenue"], label=channel)

  plt.xlabel("Year")
  plt.ylabel("Revenue")
  plt.title("Revenue Trend by Sales Channel")
  plt.legend()
  plt.grid(True)
  plt.show()

# ============================================================
# Main Function
# ============================================================

df_countries = load_csv("countries.csv", "Countries")
df_events = load_csv("events.csv", "Events")
df_products = load_csv("products.csv", "Products")

df_events, df_countries = clean_events(df_events, df_countries)
df_merged = merge_and_transform(df_events, df_countries, df_products)

plot_profit_by_region(df_merged)
plot_fulfillment_vs_margin(df_merged)
plot_profit_trend_by_priority(df_merged)
plot_avg_profit_per_quarter(df_merged)
plot_europe_subregion_distribution(df_merged)
plot_sales_channel_revenue_trend(df_merged)
Mounted at /content/drive
/content/drive/MyDrive/dataset_final_project
[INFO] Countries loaded successfully with 249 rows and 5 columns.
[INFO] Events loaded successfully with 1330 rows and 10 columns.
[INFO] Products loaded successfully with 12 rows and 2 columns.
[INFO] Missing Country Code before cleaning: 82 rows (6.17%)
[INFO] Added 'UNK' as placeholder for 82 missing rows.
[INFO] Dropped 2 rows with missing Units Sold.
[INFO] Dataset cleaned. Final shape: (1328, 10)
[INFO] Shape after merging: (1328, 17)
[INFO] Dropped columns: ['alpha-2', 'alpha-3', 'id']
[INFO] Transformation complete. Final shape: (1328, 21)
No description has been provided for this image
[INFO] Displayed bar chart for Total Profit by Region.
No description has been provided for this image
[INFO] Displayed scatter + regression for Fulfillment Days vs Profit Margin.
No description has been provided for this image
[INFO] Displayed line chart for Profit Trend by Order Priority.
No description has been provided for this image
[INFO] Displayed bar chart for Average Profit per Quarter.
No description has been provided for this image
[INFO] Displayed pie chart for Order Distribution in Europe Sub-Region.
No description has been provided for this image

📊 Sales Data Analysis (2010–2017)¶

1. Executive Summary¶

This analysis provides a comprehensive view of company performance across regions, order priorities, fulfillment efficiency, seasonal profitability, European sub-regional trends, and sales channel dynamics.

Key highlights:

  • Europe dominates profitability, with Southern Europe driving most of the demand.
  • Asia remains underpenetrated, representing a long-term growth opportunity.
  • Faster fulfillment directly correlates with higher margins, underscoring the importance of supply chain efficiency.
  • Q1 and Q2 deliver the highest total profit, while Q3 lags, confirming seasonal strength early in the year.
  • Critical and High priority orders generate disproportionate profit, highlighting where focus should be maintained.
  • Western Europe lags in order share, suggesting either market saturation or lack of investment compared to other regions.
  • Offline sales outperform online historically but are volatile, while online sales show stability but recent decline, signaling the need for balanced channel strategies.

Overall, the company should reinforce strengths in Europe, address underperformance in Asia and Western Europe, realign seasonal marketing efforts to Q1–Q2, and rebalance sales channel strategies by stabilizing offline performance while revitalizing online growth.


2. Introduction¶

This project analyzes global sales data from 2010 to 2017.

The dataset includes information such as:

  • Order date and region (Europe, Asia, Unknown)
  • Sales channels (Online & Offline) and order priority (Critical, High, Medium, Low)
  • Fulfillment time (in days)
  • Profit margin (in percentage)

Objective:
To uncover trends and insights that can guide strategic business decisions regarding regional performance, product seasonality, and operational efficiency.


3. Business Questions¶

The analysis aims to answer the following key questions:

  1. 📈 How does sales performance differ between Europe and Asia?
  2. 💰 What is the relationship between order fulfillment time and profit margin?
  3. 🔥 Which order priority contributes most to overall profitability?
  4. 🕒 Are there any seasonal trends in sales or profit across quarters and years?
  5. 🌍 How is the distribution of orders across sub-regions within Europe?
  6. 📈 What is the relationship between Sales Channel and Total Revenue?

4. Analysis & Visualizations¶

Q1. How does sales performance differ between Europe and Asia?¶

Visualization:
Diagram 1.jpg

Analysis:

Analysis: Total Profit per Region¶

The bar chart clearly illustrates that Europe dominates total profit, generating more than 20 times the profit of Asia.

  • Europe: The largest contributor by far, representing the core market.
  • Unknown: Surprisingly, this category outperforms Asia. This indicates a data quality issue in country codes and should be investigated further.
  • Asia: Profit contribution is minimal, showing limited market penetration.

Key Insights:

  1. Europe is the company's primary profit driver.
  2. The relatively high “Unknown” category suggests issues in data entry or system integration.
  3. Asia remains underdeveloped compared to Europe, representing a potential growth opportunity if properly addressed.

Business Questions to Explore Further:

  • Why is Asia underperforming compared to Europe?
  • What specific countries are included in “Unknown”? Could they be reassigned to the correct region?
  • Should the company focus on strengthening its core (Europe), or diversify into Asia to balance regional risk?

Q2. What is the relationship between order fulfillment time and profit margin?¶

Visualization:
Diagram 2.jpg

Analysis:

Analysis: Fulfillment Days vs Profit Margin¶

The scatter plot shows the relationship between fulfillment days and profit margin, segmented by Europe and Asia.

  • Both regions demonstrate relatively stable profit margins, mostly concentrated between 60–70%, regardless of the delivery time.
  • The trendlines for Europe and Asia are almost flat, indicating no strong correlation between delivery speed and profitability.

Key Insight:
In this dataset, fulfillment speed does not appear to be a major driver of profit margin. Other factors such as product category, sales channel, or regional demand may have stronger influence.


Q3. Which order priority contributes most to overall profitability?¶

Visualization:
Diagram 3.jpg

Analysis:

Analysis: Profit Trend by Order Priority (2010–2017)¶

The chart highlights significant differences across order priorities:

  • 2012 Peak: All order priorities reached their highest profit levels in 2012.
  • High Priority Orders: Initially strong, peaking in 2012, but showing a sharp decline afterward, hitting the lowest level by 2017.
  • Medium Priority Orders: The most stable category, with relatively consistent profit compared to others, though also declining in the final years.
  • Critical Priority Orders: Started strong in 2010 but dropped significantly by 2013 and showed a downward trend overall.
  • Low Priority Orders: Consistently weaker, except in 2016 when it briefly outperformed both High and Critical priorities.

Key Insights:

  1. 2012 was a boom year for all categories.
  2. High Priority orders are no longer profitable by 2017, raising concerns about efficiency or cost structure.
  3. Medium Priority is the most resilient and reliable.
  4. Low Priority, while usually weak, still managed to outperform in 2016.

Business Questions:

  • Why did all categories peak in 2012? Was there a special campaign, market expansion, or external factor?
  • Why are High Priority orders declining so sharply—are costs too high compared to revenue?
  • Should the company reconsider how it handles Critical and High Priority orders?

Q4. Are there any seasonal trends in sales or profit across quarters and years?¶

Visualization:
Diagram 4.jpg

Analysis:

Analysis: Average Total Profit per Quarter (2010–2017)¶

The updated bar chart shows that Q1 delivers the highest average total profit, followed by Q2, while Q3 lags behind and Q4 partially recovers.

  • Q1: Strongest quarter, with total profit reaching close to 18M.
  • Q2: Solid performance, averaging around 16M, slightly below Q1.
  • Q3: Weakest quarter, at around 15M, indicating softer demand or rising costs mid-year.
  • Q4: Shows a rebound above Q3 (around 15.8M), but still below Q1–Q2.

Key Insights:

  1. Total profitability peaks early in the year (Q1–Q2), confirming strong front-loaded demand cycles.
  2. Q3 consistently underperforms, suggesting either seasonal slowdown or operational bottlenecks.
  3. Q4 does not match Q1–Q2 strength, indicating this market does not follow Western-style holiday seasonality.

Business Questions to Explore Further:

  • What drives the strong Q1 and Q2 performance—demand timing, procurement cycles, or regional market behavior?
  • Why is Q3 structurally weak—lower consumer demand, supply chain constraints, or external market conditions?
  • Should the company reallocate promotional and marketing budgets from Q4 to Q1–Q2 to maximize return on investment?

Q5. How is the distribution of orders across sub-regions within Europe?¶

Visualization:
Diagram 5.jpg

Analysis:

Analysis: Order Distribution of Europe Sub-Region¶

The pie chart illustrates the distribution of orders across European sub-regions. Southern Europe dominates order volume, followed by Eastern and Northern Europe, while Western Europe trails behind.

  • Southern Europe (35.3%): Largest contributor, indicating strong demand and customer base in this sub-region.
  • Eastern Europe (23.3%): Solid share, reflecting consistent engagement.
  • Northern Europe (22.3%): Comparable to Eastern Europe, but slightly smaller.
  • Western Europe (19.2%): The smallest share, showing weaker order activity.

Key Insights:

  1. Southern Europe is the primary driver of European order volume.
  2. Eastern and Northern Europe contribute almost equally, showing balanced mid-level demand.
  3. Western Europe underperforms, suggesting either market saturation, lower engagement, or limited reach.

Business Questions to Explore Further:

  • What factors are driving Southern Europe’s dominance—population size, purchasing power, or marketing effectiveness?
  • Why is Western Europe lagging behind despite being a traditionally strong market?
  • Should the company invest more in Western Europe to boost its performance, or double down on the high-growth potential of Southern Europe?

Q6. What is the relationship between Sales Channel and Total Revenue?¶

Visualization:
Diagram 6.jpg

Analysis:

Revenue Trend by Sales Channel (2010–2017)¶

  • Offline Sales: Generally higher than online, but with greater volatility (sharp drop in 2013, recovery, then decline post-2015).
  • Online Sales: More stable but showed a downward trend after peaking around 2012–2014.
  • 2010–2012: Offline overtook Online and dominated.
  • 2014 onwards: Both channels declined, with Online shrinking faster.

Key Insights:

  1. Significant fluctuations in revenue are observed across both channels (online & offline), with peaks in 2012 and 2015, and sharp declines in 2013 and 2017.
  2. Offline sales were dominant in 2012 and 2015 but proved more volatile with drastic drops (e.g., 2013).
  3. Online sales were relatively stable but have been declining since 2014, suggesting challenges in sustaining digital growth.
  4. Both channels declined in 2016–2017, indicating possible external factors (market downturn, regulation, or shifting customer preferences).
  5. Possible cannibalization between channels: in some years, growth in one channel coincides with a drop in the other (e.g., 2011–2012).

Business Questions to Explore Further:

  • Were external factors (internet penetration, regulation, new competitors) influencing these trends?
  • Is there a correlation between marketing/sales expenses and channel-specific revenue?
  • What strategies can reverse the decline (product diversification, digital transformation, omni-channel approach)?

5. Key Insights & Recommendations¶

Key Insights¶

  • Regional Performance: Europe remains the company’s backbone, but Asia underperforms and “Unknown” regions suggest data quality issues.
  • Seasonality (Total Profit): Profitability peaks in Q1 and Q2, with Q3 showing the weakest performance and Q4 partially recovering.
  • Order Priority: Critical and High priority orders deliver the majority of profits, showing a clear link between urgency and profitability.
  • Fulfillment: Faster delivery times directly correlate with margin improvements.
  • Sub-Regional Trends: Southern Europe is the strongest within Europe, while Western Europe lags in order share.
  • Sales Channel: Offline sales drive higher revenue historically but fluctuate, while online sales are steadier though recently declining.

Recommendations¶

  1. 🌍 Asia Market Strategy

    • Investigate barriers (competition, distribution, localization).
    • Build a phased expansion plan instead of aggressive short-term push.
  2. 📆 Seasonal Marketing Realignment (Q1–Q2 Focus)

    • Concentrate major campaigns in Q1 and Q2 to leverage natural profit peaks.
    • Introduce mid-year promotions in Q3 to counter seasonal weakness.
    • Position Q4 as a retention and upselling period.
  3. 🚚 Supply Chain & Fulfillment Optimization

    • Shorten fulfillment cycles to strengthen margins.
    • Prioritize investments in logistics automation and regional hubs.
  4. 📊 Data Governance

    • Audit “Unknown” region orders to resolve data integrity issues.
    • Reassign misclassified sales to correct regions for better visibility.
  5. 🔝 High-Value Order Focus

    • Maintain focus on Critical & High priority customers with premium service tiers.
    • Explore loyalty programs or expedited delivery benefits to strengthen retention.
  6. 🇪🇺 Europe Sub-Region Strategy

    • Double down on Southern Europe to defend market leadership.
    • Diagnose Western Europe’s lagging performance—is it market maturity, brand perception, or competitor presence?
    • Tailor localized campaigns for Eastern & Northern Europe to unlock balanced growth.
  7. 🛒 Balanced Sales Channel Strategy

    • Stabilize offline sales with better demand forecasting and inventory management.
    • Revitalize online sales through digital campaigns, improved UX, and targeted incentives.
    • Pursue omnichannel integration to smooth volatility across channels.

6. Portfolio Note¶

This project demonstrates the ability to:

  • Formulate business questions
  • Perform exploratory data analysis
  • Translate insights into actionable business recommendations