# End-to-end data pull and merge pipeline for:
#
# 1) Worldwide Governance Indicators (WGI) via World Bank (wbgapi)
# 2) World Development Indicators (WDI) via World Bank (wbgapi):
#    - Natural resource rents (% of GDP): mineral, oil, gas, and total
# 3) Extractive Industries Transparency Initiative (EITI) summary data
#    via the EITI v2.0 API (requests)
#
# Country list:
# - ISO3_LIST is built from EITI /implementing_country and restricted to
#   implementing countries with ≥1 summary_data record in 2015–2021
#   (active_only=False; countries that have left EITI are retained if present).
#
# Outputs:
# - wgi_df: Unbalanced country–year panel of six governance dimensions
#          for ISO3_LIST (2015–2023)
# - wdi_df: Country–year panel of natural resource rents (% of GDP)
#          for ISO3_LIST (2015–2021)
# - eiti_df: Country–year panel of EITI summary totals (government and company),
#           reported currency, and EITI-provided currency_rate (when available),
#           plus USD-converted revenue columns derived in-code
# - merged_df: Final merged panel using wgi_df as the spine (left joins on iso3c + year),
#              with WDI rents appended and EITI government revenue in USD appended
#              (eiti_revenue_government_usd only).
#
# Notes:
# - EITI coverage varies by country and year; country–years without summary data remain NaN.
# - The merged dataset is an unbalanced country–year panel by construction.

import re
import json
import warnings
from typing import List, Dict, Optional

import numpy as np
import pandas as pd
import random
import requests
import time
import wbgapi as wb

import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.colors import LinearSegmentedColormap, TwoSlopeNorm
from matplotlib.cm import ScalarMappable
import geopandas as gpd

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from scipy.stats import ttest_ind
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
from statsmodels.stats.sandwich_covariance import cov_cluster
import pycountry

from ipywidgets import SelectMultiple, VBox, Output, HTML
from IPython.display import display, clear_output
# -----------------------------
# 0) CONFIG
#   - do not restrict to active countries (leave_date is missing)
#   - require at least one EITI summary record in 2015–2021
# -----------------------------

# Base URL that every EITI request in this file is built from.
EITI_BASE = "https://eiti.org/api/v2.0"

# Years
# WGI is pulled for a longer window than WDI; the merged panel uses WGI as
# its spine, so the extra WGI years carry NaN for the WDI/EITI columns.
WGI_YEARS = list(range(2015, 2024))  # 2015–2023 inclusive
WDI_YEARS = list(range(2015, 2022))  # 2015–2021 inclusive

# Indicators
# Maps World Bank series code -> output column name (governance estimates).
WGI_INDICATORS = {
    "CC.EST": "control_of_corruption",
    "GE.EST": "government_effectiveness",
    "PV.EST": "political_stability",
    "RQ.EST": "regulatory_quality",
    "RL.EST": "rule_of_law",
    "VA.EST": "voice_and_accountability",
}

# Maps World Bank series code -> output column name (resource rents, % of GDP).
WDI_RENT_INDICATORS = {
    "NY.GDP.MINR.RT.ZS": "mineral_rents_pct_gdp",
    "NY.GDP.PETR.RT.ZS": "oil_rents_pct_gdp",
    "NY.GDP.NGAS.RT.ZS": "gas_rents_pct_gdp",
    "NY.GDP.TOTL.RT.ZS": "total_natural_resource_rents_pct_gdp",
}

def eiti_get(url, timeout=60):
    """GET *url* asking for JSON and return the decoded response body.

    Args:
        url: Full URL to request.
        timeout: Timeout passed to ``requests.get`` (default 60).

    Raises:
        requests.HTTPError: via ``raise_for_status`` on a 4xx/5xx response.
    """
    json_headers = {"Accept": "application/json"}
    response = requests.get(url, headers=json_headers, timeout=timeout)
    response.raise_for_status()
    return response.json()

# -----------------------------
# Fetch implementing_country
# -----------------------------
payload = eiti_get(f"{EITI_BASE}/implementing_country")
implementing_df = pd.DataFrame(payload.get("data", []))

if implementing_df.empty:
    raise ValueError(
        "EITI /implementing_country returned no data. "
        f"Top-level keys: {list(payload.keys())}"
    )

if "iso3" not in implementing_df.columns:
    raise ValueError(
        "EITI /implementing_country response missing 'iso3' column. "
        f"Columns present: {list(implementing_df.columns)}"
    )

# Clean ISO3
# Drop missing codes BEFORE casting to str: astype(str) turns NaN into "nan",
# which upper-cases to "NAN" and would slip through the 3-letter regex below
# as if it were a real country code.
implementing_df = implementing_df.dropna(subset=["iso3"]).copy()
implementing_df["iso3"] = implementing_df["iso3"].astype(str).str.upper().str.strip()
implementing_df = implementing_df[
    implementing_df["iso3"].str.match(r"^[A-Z]{3}$", na=False)
].copy()

# Normalize leave_date (sometimes empty string) so that .isna() identifies
# countries without a leave date.
if "leave_date" in implementing_df.columns:
    implementing_df["leave_date"] = implementing_df["leave_date"].replace("", np.nan)


def fetch_eiti_iso3_list(
    implementing_df: pd.DataFrame,
    years_for_summary_check=None,   # e.g., range(2015, 2022) to align with WDI/EITI window
    require_any_summary=True,       # keep countries with >=1 summary record in those years
    active_only=False,              # TRUE = drop countries that have a leave_date (left EITI)
    limit=None,                     # get ALL countries
    sort=True,
    verbose=True
):
    """
    Derive a list of ISO3 codes from an already-fetched /implementing_country frame.

    No network call is made here; the function only filters ``implementing_df``.

    Args:
        implementing_df: Frame with an 'iso3' column and, optionally,
            'leave_date' and 'summary_data.<year>' columns. It is not modified.
        years_for_summary_check: Iterable of years. When given together with
            ``require_any_summary=True``, only rows with a non-null value in at
            least one 'summary_data.<year>' column for those years are kept.
        require_any_summary: Enables the summary-record filter above.
        active_only: Keep only rows whose 'leave_date' is missing.
        limit: If not None, truncate the (optionally sorted) list to this many codes.
        sort: Sort the codes alphabetically.
        verbose: Print how many rows/codes survive each step.

    Returns:
        List of unique ISO3 codes.

    Raises:
        ValueError: if ``active_only=True`` but 'leave_date' is absent, or if
            none of the requested 'summary_data.<year>' columns exist (the
            message lists the summary-like columns that are available, which
            helps diagnose an upstream schema change).

    IMPORTANT:
    - 'status.label' in this endpoint is a validation rating (Moderate/High/etc.),
      NOT an implementing/candidate label. So no filter on it.
    """
    df = implementing_df.copy()

    # Active-only filter
    if active_only:
        if "leave_date" not in df.columns:
            raise ValueError(
                "active_only=True but 'leave_date' column is missing from the endpoint response."
            )
        n_before = len(df)
        df = df[df["leave_date"].isna()].copy()
        if verbose:
            print(f"[EITI] active_only filter: kept {len(df)} / {n_before} rows")

    # Require at least one summary_data record in chosen years
    if years_for_summary_check is not None and require_any_summary:
        requested = [f"summary_data.{y}" for y in years_for_summary_check]
        year_cols = [c for c in requested if c in df.columns]

        # Robustness check (schema change protection)
        if not year_cols:
            summary_like_cols = [c for c in df.columns if c.startswith("summary_data")]
            raise ValueError(
                "No 'summary_data.<year>' columns found for the requested years. "
                "This can happen if the EITI API schema changed.\n"
                f"Requested (examples): {requested[:5]} ...\n"
                f"Available columns starting with 'summary_data': {summary_like_cols[:30]}"
                + (" ..." if len(summary_like_cols) > 30 else "")
            )

        n_before = len(df)
        df = df[df[year_cols].notna().any(axis=1)].copy()
        if verbose:
            print(
                f"[EITI] summary-record filter ({len(year_cols)} year columns): "
                f"kept {len(df)} / {n_before} rows"
            )

    # dropna: a missing code is not a country, and NaN mixed with strings
    # would make sorted() raise TypeError.
    iso3_list = df["iso3"].dropna().drop_duplicates().tolist()

    if sort:
        iso3_list = sorted(iso3_list)

    if limit is not None:
        iso3_list = iso3_list[: int(limit)]

    if verbose:
        print(f"[EITI] ISO3 codes returned: {len(iso3_list)}")

    return iso3_list

# -----------------------------
# Generate ISO3_LIST for the project
# -----------------------------
# Keep every implementing country (including ones that have left EITI) that
# has at least one summary_data record in 2015–2021.
ISO3_LIST = fetch_eiti_iso3_list(
    implementing_df,
    years_for_summary_check=range(2015, 2022),
    require_any_summary=True,
    active_only=False,
    limit=None,
    sort=True,
    verbose=True,
)

print(f"ISO3_LIST length: {len(ISO3_LIST)}")
print(ISO3_LIST)

# Output:
# ISO3_LIST length: 59
# ['AFG', 'AGO', 'ALB', 'ARG', 'ARM', 'AZE', 'BFA', 'CIV', 'CMR', 'COD', 'COG', 'COL', 'DEU', 'DOM', 'ECU', 'ETH', 'GAB', 'GBR', 'GHA', 'GIN', 'GTM', 'GUY', 'HND', 'IDN', 'IRQ', 'KAZ', 'KGZ', 'LBR', 'MDG', 'MEX', 'MLI', 'MMR', 'MNG', 'MOZ', 'MRT', 'MWI', 'NER', 'NGA', 'NLD', 'NOR', 'PER', 'PHL', 'PNG', 'SEN', 'SLB', 'SLE', 'STP', 'SUR', 'SYC', 'TCD', 'TGO', 'TJK', 'TLS', 'TTO', 'TZA', 'UGA', 'UKR', 'USA', 'ZMB']

def fetch_world_bank_panel(iso3_list, indicator_map, years):
    """
    Pull a country-year panel from the World Bank using wbgapi.

    Works for both single-indicator and multi-indicator pulls.

    Args:
        iso3_list: Non-empty list of ISO3 economy codes.
        indicator_map: Dict mapping World Bank series code -> output column name.
        years: Years passed to wbgapi as ``time``.

    Returns:
        Tidy DataFrame sorted by (iso3c, year) with columns
        iso3c, year, and one numeric column per value of ``indicator_map``.
        In the multi-indicator path, country-years with no value for any
        indicator are absent (pivot_table drops them), so the panel may be
        unbalanced.

    Raises:
        ValueError: if ``iso3_list`` is empty or the pull returns no rows.
    """
    if not iso3_list:
        raise ValueError("ISO3_LIST is empty. Fill it with your ISO3 codes first.")

    codes = list(indicator_map.keys())
    raw = wb.data.DataFrame(codes, economy=iso3_list, time=years, labels=False)

    if raw.empty:
        raise ValueError(
            "World Bank pull returned empty DataFrame. "
            "Check indicator codes, ISO3 list, and years."
        )

    # Multi-indicator case: index includes 'series'
    if isinstance(raw.index, pd.MultiIndex) and "series" in raw.index.names:
        tidy = (
            raw.reset_index()
               .melt(id_vars=["economy", "series"], var_name="year", value_name="value")
        )
        tidy["year"] = tidy["year"].astype(str).str.replace("YR", "", regex=False).astype(int)

        tidy = (
            tidy.pivot_table(index=["economy", "year"], columns="series", values="value", aggfunc="first")
                .reset_index()
                .rename(columns={"economy": "iso3c"})
        )

    # Single-indicator case: no 'series' level
    else:
        tidy = (
            raw.reset_index()
               .rename(columns={"economy": "iso3c"})
               .melt(id_vars=["iso3c"], var_name="year", value_name=codes[0])
        )
        tidy["year"] = tidy["year"].astype(str).str.replace("YR", "", regex=False).astype(int)

    tidy = tidy.rename(columns=indicator_map)

    # enforce consistent key formatting
    tidy["iso3c"] = tidy["iso3c"].astype(str).str.upper().str.strip()

    # coerce indicator columns to numeric
    for col in indicator_map.values():
        # pivot_table (dropna=True) omits a series that has no observations at
        # all; re-create it as all-NaN so the output schema is stable and the
        # coercion below cannot raise KeyError.
        if col not in tidy.columns:
            tidy[col] = np.nan
        tidy[col] = pd.to_numeric(tidy[col], errors="coerce")

    tidy = tidy.sort_values(["iso3c", "year"]).reset_index(drop=True)
    return tidy

# Governance estimates (2015–2023) and resource rents (2015–2021) for the
# same country list.
wgi_df = fetch_world_bank_panel(
    iso3_list=ISO3_LIST, indicator_map=WGI_INDICATORS, years=WGI_YEARS
)
wdi_df = fetch_world_bank_panel(
    iso3_list=ISO3_LIST, indicator_map=WDI_RENT_INDICATORS, years=WDI_YEARS
)

def to_float(x):
    """Best-effort conversion of *x* to float.

    Thousands separators (",") are removed from the string form of *x* before
    parsing. ``None`` and anything that cannot be parsed yield NaN.
    """
    parsed = np.nan
    if x is not None:
        try:
            parsed = float(str(x).replace(",", ""))
        except Exception:
            parsed = np.nan
    return parsed

def first_non_null(s: pd.Series):
    """Return the first non-null element of *s*, or NaN when there is none."""
    present = s[s.notna()]
    if present.empty:
        return np.nan
    return present.iloc[0]

def get_iso3_from_record(rec):
    """Extract the upper-cased, whitespace-stripped ISO3 code from a record.

    Looks first at the flattened key ``"country.iso3"``, then at the nested
    ``rec["country"]["iso3"]``.

    Returns:
        The normalised code, or ``""`` when no string code is found
        (including when ``country`` is not a dict or the code is not a string).
    """
    code = rec.get("country.iso3")
    if not code:
        country = rec.get("country")
        # Guard against a non-dict 'country' value, which has no .get().
        code = country.get("iso3") if isinstance(country, dict) else None
    if not isinstance(code, str):
        return ""
    # Strip as well as upper-case so the key matches the other ISO3 columns
    # in this file, which are normalised with .upper().strip().
    return code.strip().upper()

def fetch_eiti_revenue_government_sum_from_implementing_df(
    implementing_df: pd.DataFrame,
    iso3_list,
    years,
    verbose=True
):
    """
    Build a country-year panel of EITI summary totals.

    For every country in ``iso3_list`` and every year in ``years``, the
    'summary_data.<year>' column of ``implementing_df`` is read as a summary id;
    each non-null id is fetched from ``{EITI_BASE}/summary_data/<id>`` and the
    first record of its 'data' list is kept.

    Args:
        implementing_df: Already-fetched /implementing_country frame with an
            'iso3' column. It is not modified.
        iso3_list: ISO3 codes to include (case-insensitive).
        years: Iterable of years to look up.
        verbose: Print progress counts.

    Returns:
        (eiti_df, meta) where ``eiti_df`` has one row per (iso3c, year) with
        columns iso3c, year, eiti_revenue_government_sum,
        eiti_revenue_company_sum, eiti_currency, eiti_currency_rate and
        eiti_summary_id. These columns are present even when nothing was
        pulled. ``meta`` holds counts plus two frames: 'missing_record_df'
        (country-years without a summary id) and 'failed_calls_df'
        (requests that raised or returned no data).

    Raises:
        ValueError: if ``implementing_df`` is None/empty or lacks 'iso3'.
    """
    eiti_columns = [
        "iso3c",
        "year",
        "eiti_revenue_government_sum",
        "eiti_revenue_company_sum",
        "eiti_currency",
        "eiti_currency_rate",
        "eiti_summary_id",
    ]
    iso3_set = {c.upper() for c in iso3_list}

    # -----------------------------
    # 1) Filter implementing countries using the already-fetched df
    # -----------------------------
    if implementing_df is None or implementing_df.empty:
        raise ValueError("implementing_df is empty. Fetch /implementing_country first in Block 0.")

    if "iso3" not in implementing_df.columns:
        raise ValueError("implementing_df missing 'iso3' column.")

    countries = implementing_df.copy()
    countries["iso3"] = countries["iso3"].astype(str).str.upper().str.strip()
    countries = countries[countries["iso3"].isin(iso3_set)].copy()

    if verbose:
        print(f"[EITI] implementing_country matched: {len(countries)} / {len(iso3_set)} ISO3 codes")

    # -----------------------------
    # 2) Identify availability
    # -----------------------------
    expected_pairs = []        # summary_id exists
    missing_record_pairs = []  # no summary_id (true missingness)

    for _, row in countries.iterrows():
        for y in years:
            # A year column that is absent altogether is treated like a
            # missing record for that country-year.
            sid = row.get(f"summary_data.{y}", np.nan)
            if pd.notna(sid):
                expected_pairs.append((row["iso3"], int(y), sid))
            else:
                missing_record_pairs.append((row["iso3"], int(y)))

    if verbose:
        print(f"[EITI] country-years with summary records: {len(expected_pairs)}")
        print(f"[EITI] country-years with NO summary record: {len(missing_record_pairs)}")

    # -----------------------------
    # 3) Pull summary records
    # -----------------------------
    rows = []
    failed_calls = []

    for iso3, y, sid in expected_pairs:
        url = f"{EITI_BASE}/summary_data/{sid}"
        try:
            d = eiti_get(url)
        except Exception as e:
            # Deliberate best-effort: one failed request must not abort the
            # whole pull; the failure is recorded in meta['failed_calls_df'].
            failed_calls.append((iso3, y, sid, str(e)))
            continue

        data_list = d.get("data", [])
        if not data_list:
            failed_calls.append((iso3, y, sid, "Empty or missing 'data' list"))
            continue

        rec = data_list[0]

        rows.append({
            # Prefer the code inside the record; fall back to the requested one.
            "iso3c": get_iso3_from_record(rec) or iso3,
            "year": y,
            "eiti_revenue_government_sum": to_float(rec.get("revenue_government_sum")),
            "eiti_revenue_company_sum": to_float(rec.get("revenue_company_sum")),
            "eiti_currency": rec.get("currency"),
            "eiti_currency_rate": to_float(rec.get("currency_rate")),
            "eiti_summary_id": sid
        })

    # -----------------------------
    # 4) Deduplicate + clean
    # -----------------------------
    # Passing the column list keeps the schema stable when `rows` is empty;
    # otherwise callers indexing e.g. 'eiti_currency_rate' hit a KeyError.
    eiti_df = pd.DataFrame(rows, columns=eiti_columns)

    if not eiti_df.empty:
        eiti_df = (
            eiti_df.sort_values(["iso3c", "year"])
                   .groupby(["iso3c", "year"], as_index=False)
                   .agg({
                       "eiti_revenue_government_sum": first_non_null,
                       "eiti_revenue_company_sum": first_non_null,
                       "eiti_currency": first_non_null,
                       "eiti_currency_rate": first_non_null,
                       "eiti_summary_id": first_non_null
                   })
        )

    # -----------------------------
    # 5) Metadata summary
    # -----------------------------
    meta = {
        "matched_countries": len(countries),
        "country_years_with_record": len(expected_pairs),
        "country_years_no_record": len(missing_record_pairs),
        "pulled_country_years": eiti_df.shape[0],
        "failed_api_calls": len(failed_calls),
        "missing_record_df": pd.DataFrame(missing_record_pairs, columns=["iso3c", "year"]),
        "failed_calls_df": pd.DataFrame(failed_calls, columns=["iso3c", "year", "summary_id", "error"])
    }

    if verbose:
        print(f"[EITI] pulled country-years: {eiti_df.shape[0]} / {len(expected_pairs)}")
        print(f"[EITI] true API failures: {len(failed_calls)}")

    return eiti_df, meta

# -----------------------------
# Execute EITI pull (2015–2021) using implementing_df
# -----------------------------
eiti_df, eiti_meta = fetch_eiti_revenue_government_sum_from_implementing_df(
    implementing_df,
    ISO3_LIST,
    WDI_YEARS,
    verbose=True,
)

print(eiti_df.head())

# -----------------------------
# Convert EITI revenues to USD
# -----------------------------
# Reported amounts are divided by eiti_currency_rate; rows whose rate is
# missing or not strictly positive get NaN instead of a converted value.
_rate = eiti_df["eiti_currency_rate"]
_rate_usable = _rate.notna() & (_rate > 0)

for _reported_col, _usd_col in (
    ("eiti_revenue_government_sum", "eiti_revenue_government_usd"),
    ("eiti_revenue_company_sum", "eiti_revenue_company_usd"),
):
    eiti_df[_usd_col] = np.where(_rate_usable, eiti_df[_reported_col] / _rate, np.nan)

# Quick dtype check
# print(eiti_df[["eiti_revenue_government_sum", "eiti_currency_rate", "eiti_revenue_government_usd"]].dtypes)

# Output:
# [EITI] implementing_country matched: 59 / 59 ISO3 codes
# [EITI] country-years with summary records: 305
# [EITI] country-years with NO summary record: 108
# [EITI] pulled country-years: 305 / 305
# [EITI] true API failures: 0
#   iso3c  year  eiti_revenue_government_sum  eiti_revenue_company_sum
# 0   AFG  2015                 3.853342e+07              3.853342e+07
# 1   AFG  2016                 2.972585e+07              2.353956e+07
# 2   AFG  2017                 4.716721e+07              4.374935e+07
# 3   AFG  2018                 6.707700e+07              6.262905e+07
# 4   AFG  2019                 5.516008e+07              5.497733e+07
#
#   eiti_currency  eiti_currency_rate eiti_summary_id
# 0           AFN               66.70          AF2015
# 1           AFN               67.60          AF2016
# 2           AFN               68.65          AF2017
# 3           AFN               72.08          AF2018
# 4           AFN               77.74          AF2019

# 0) Consistent types/casing across all frames
#    (strip as well as upper-case so stray whitespace cannot break the join)
for df in [wgi_df, wdi_df, eiti_df]:
    df["iso3c"] = df["iso3c"].astype(str).str.upper().str.strip()
    df["year"]  = df["year"].astype(int)

# 1) Uniqueness checks: prevent accidental row-multiplication on merge.
#    Explicit raises are used instead of `assert`, which is skipped when
#    Python runs with -O.
for _frame_name, _frame in [("wgi_df", wgi_df), ("wdi_df", wdi_df), ("eiti_df", eiti_df)]:
    if _frame.duplicated(["iso3c", "year"]).any():
        raise ValueError(f"{_frame_name} has duplicate iso3c-year keys.")

# 2) Merge (WGI as the spine)
#    Left joins keep every WGI country-year; WDI/EITI columns are NaN where
#    those sources have no matching row.
merged_df = (
    wgi_df
      .merge(wdi_df, on=["iso3c", "year"], how="left", validate="one_to_one")
      .merge(
          eiti_df[[
              "iso3c",
              "year",
              "eiti_revenue_government_usd",
          ]],
          on=["iso3c", "year"],
          how="left",
          validate="one_to_one"
      )
      .sort_values(["iso3c", "year"])
      .reset_index(drop=True)
)

# 3) Post-merge sanity checks
expected_rows = wgi_df.shape[0]
if merged_df.shape[0] != expected_rows:
    raise ValueError(
        f"Row count changed after merge: {merged_df.shape[0]} vs {expected_rows} (wgi_df)."
    )
if merged_df.duplicated(["iso3c", "year"]).any():
    raise ValueError("merged_df has duplicate iso3c-year keys.")

# -----------------------------
# DATA VALIDATION: MERGED DATASET
# -----------------------------

# Basic shape and uniqueness
print("Rows:", merged_df.shape[0])
print("Unique country-years:", merged_df[["iso3c", "year"]].drop_duplicates().shape[0])

# Missingness overview (top 15) - FULL PANEL (2015–2023)
# Share of missing values per column, worst first. head(15) makes the output
# match the "top 15" heading if more columns are added later.
print("\nMissingness overview (FULL PANEL 2015–2023) — top 15:")
missing_full = (
    merged_df.isna()
             .mean()
             .sort_values(ascending=False)
             .head(15)
)
display(missing_full)

# Missingness overview (top 15) - ANALYSIS WINDOW (2015–2021)
print("\nMissingness overview (2015–2021 ONLY) — top 15:")
merged_2015_2021 = merged_df.loc[merged_df["year"].between(2015, 2021)].copy()

missing_2015_2021 = (
    merged_2015_2021.isna()
                    .mean()
                    .sort_values(ascending=False)
                    .head(15)
)
display(missing_2015_2021)

# Quick row count sanity checks for expected panel sizes
print("\nSanity checks:")
print("Rows in 2015–2021 subset:", merged_2015_2021.shape[0])
print("Rows in 2022–2023 subset:", merged_df.loc[merged_df["year"].between(2022, 2023)].shape[0])

# Output:
# Rows: 531
# Unique country-years: 531
#
# Missingness overview (FULL PANEL 2015–2023) — top 15:
#
# eiti_revenue_government_usd             0.425612
# gas_rents_pct_gdp                       0.235405
# mineral_rents_pct_gdp                   0.222222
# oil_rents_pct_gdp                       0.222222
# total_natural_resource_rents_pct_gdp    0.222222
# iso3c                                   0.000000
# year                                    0.000000
# control_of_corruption                   0.000000
# government_effectiveness                0.000000
# political_stability                     0.000000
# rule_of_law                             0.000000
# regulatory_quality                      0.000000
# voice_and_accountability                0.000000
# dtype: float64
#
# Missingness overview (2015–2021 ONLY) — top 15:
#
# eiti_revenue_government_usd             0.261501
# gas_rents_pct_gdp                       0.016949
# iso3c                                   0.000000
# year                                    0.000000
# control_of_corruption                   0.000000
# government_effectiveness                0.000000
# political_stability                     0.000000
# rule_of_law                             0.000000
# regulatory_quality                      0.000000
# voice_and_accountability                0.000000
# mineral_rents_pct_gdp                   0.000000
# oil_rents_pct_gdp                       0.000000
# total_natural_resource_rents_pct_gdp    0.000000
# dtype: float64
#
# Sanity checks:
# Rows in 2015–2021 subset: 413
# Rows in 2022–2023 subset: 118

# Display titles keyed by WGI column name; the key order is also the panel
# order used when plotting.
TITLE_MAP = {
    "control_of_corruption": "Control of Corruption",
    "government_effectiveness": "Government Effectiveness",
    "political_stability": "Political Stability",
    "regulatory_quality": "Regulatory Quality",
    "rule_of_law": "Rule of Law",
    "voice_and_accountability": "Voice and Accountability",
}

# The six governance columns, in TITLE_MAP order.
WGI_COLS = list(TITLE_MAP)

# (key, display label, first year, last year) — year bounds are inclusive.
PERIODS = [
    ("pre_pandemic", "Pre-pandemic", 2015, 2019),
    ("pandemic",     "Pandemic",     2020, 2021),
    ("post_covid",   "Post-COVID",   2022, 2023),
]

# Background shading per period label.
PERIOD_COLORS = {
    "Pre-pandemic": "#D9D9D9",
    "Pandemic":     "#FFE066",
    "Post-COVID":   "#BFD7EA",
}
PERIOD_ALPHA = 0.40

# Full plotting window.
WINDOW_START = 2015
WINDOW_END = 2023
ALL_YEARS = list(range(WINDOW_START, WINDOW_END + 1))

# Pre/post comparison windows: same bounds as the first and last PERIODS rows.
PRE_START, PRE_END = PERIODS[0][2], PERIODS[0][3]
POST_START, POST_END = PERIODS[-1][2], PERIODS[-1][3]

TOL = 0.02

# compute wgi_mean: row-wise average of the six WGI columns (skipped if the
# column already exists, so re-running this cell is harmless)
if "wgi_mean" not in merged_df.columns:
    merged_df["wgi_mean"] = merged_df.loc[:, WGI_COLS].mean(axis="columns")

# =====================================================
# VISUAL: Revenues & Governance Over Time
# - LEFT  : EITI Govt Revenues (USD, billions)
# - RIGHT : WDI Total Natural Resource Rents (% GDP) vs Governance (WGI mean)
# =====================================================

# -----------------------------
# 0) Safety checks + prep
# -----------------------------
needed = {
    "year",
    "eiti_revenue_government_usd",
    "total_natural_resource_rents_pct_gdp",
    "wgi_mean"
}
missing = needed - set(merged_df.columns)
if missing:
    raise ValueError(f"merged_df missing required columns: {sorted(missing)}")

df = merged_df.loc[
    merged_df["year"].between(WINDOW_START, WINDOW_END),
    [
        "year",
        "eiti_revenue_government_usd",
        "total_natural_resource_rents_pct_gdp",
        "wgi_mean"
    ]
].copy()

df["year"] = df["year"].astype(int)
df["eiti_revenue_government_usd"] = pd.to_numeric(
    df["eiti_revenue_government_usd"], errors="coerce"
)
df["total_natural_resource_rents_pct_gdp"] = pd.to_numeric(
    df["total_natural_resource_rents_pct_gdp"], errors="coerce"
)
df["wgi_mean"] = pd.to_numeric(df["wgi_mean"], errors="coerce")

# -----------------------------
# 1) Aggregations
# -----------------------------
# LEFT: EITI revenues in USD (convert to billions)
# min_count=1: a year in which every country is NaN (e.g. years outside the
# EITI pull window) stays NaN instead of being reported as a total of 0.
rev_usd = (
    df.groupby("year")["eiti_revenue_government_usd"]
      .sum(min_count=1)
      .rename("eiti_revenue_usd")
      .reset_index()
)
rev_usd["eiti_revenue_usd"] = rev_usd["eiti_revenue_usd"] / 1e9  # billions USD

# RIGHT: WDI rents + governance (cross-country averages per year)
yearly = (
    df.groupby("year", as_index=False)
      .agg(
          rents_pct_gdp=("total_natural_resource_rents_pct_gdp", "mean"),
          wgi_mean=("wgi_mean", "mean")
      )
)

# Ensure full year coverage: years absent from the data appear as NaN rows
rev_usd = pd.DataFrame({"year": ALL_YEARS}).merge(rev_usd, on="year", how="left")
yearly  = pd.DataFrame({"year": ALL_YEARS}).merge(yearly,  on="year", how="left")

# -----------------------------
# 2) Figure + subplots
# -----------------------------
fig, axes = plt.subplots(1, 2, figsize=(16, 5), sharex=True)

# =====================================================
# LEFT: EITI Govt Revenues (USD, billions)
# =====================================================
axL = axes[0]

# Period shading is added before the bars so that it sits behind them.
for _, label, start, end in PERIODS:
    axL.axvspan(start - 0.5, end + 0.5, color=PERIOD_COLORS[label], alpha=PERIOD_ALPHA, lw=0)

axL.bar(rev_usd["year"], rev_usd["eiti_revenue_usd"])

axL.set_title("EITI Government Revenues Over Time", fontsize=13, fontweight="bold")
axL.set_xlabel("Year")
axL.set_ylabel("EITI govt revenue (USD, billions)")
axL.set_xticks(ALL_YEARS)
axL.grid(True, axis="y", alpha=0.25)
axL.set_axisbelow(True)

# =====================================================
# RIGHT: WDI Resource Rents vs Governance
# =====================================================
# Two y-axes on one panel: rents on the left axis, governance on the twin.
axR1 = axes[1]
axR2 = axR1.twinx()

for _, label, start, end in PERIODS:
    axR1.axvspan(start - 0.5, end + 0.5, color=PERIOD_COLORS[label], alpha=PERIOD_ALPHA, lw=0)

# WDI resource rents
axR1.plot(
    yearly["year"], yearly["rents_pct_gdp"],
    marker="o", linewidth=2, color="#1f77b4",
    label="Avg WDI total natural resource rents (% GDP)",
)
axR1.set_ylabel("Avg total natural resource rents (% of GDP)")

# Governance
axR2.plot(
    yearly["year"], yearly["wgi_mean"],
    marker="o", linewidth=2, color="#d62728",
    label="Avg governance (WGI mean)",
)
axR2.set_ylabel("Avg governance (WGI mean)")

axR1.set_title("Resource Rents and Governance Over Time", fontsize=13, fontweight="bold")
axR1.set_xlabel("Year")
axR1.set_xticks(ALL_YEARS)
axR1.grid(True, axis="y", alpha=0.25)
axR1.set_axisbelow(True)

# Combined legend (right panel): gather handles from both y-axes into one box
lines1, labels1 = axR1.get_legend_handles_labels()
lines2, labels2 = axR2.get_legend_handles_labels()
axR2.legend(lines1 + lines2, labels1 + labels2, frameon=False, loc="upper right")

# =====================================================
# Shared period legend
# =====================================================
period_handles = [
    Patch(facecolor=PERIOD_COLORS[label], edgecolor="none", alpha=PERIOD_ALPHA, label=label)
    for _, label, _, _ in PERIODS
]

fig.legend(
    handles=period_handles,
    loc="lower center",
    ncol=3,
    frameon=True,
    bbox_to_anchor=(0.5, -0.02),
)

# Reserve the bottom strip of the figure for the shared legend.
plt.tight_layout(rect=[0, 0.08, 1, 1])
plt.show()

# =====================================================
# Trends in resource rent across EITI countries
# =====================================================

rent_cols = [
    "mineral_rents_pct_gdp",
    "oil_rents_pct_gdp",
    "gas_rents_pct_gdp",
    "total_natural_resource_rents_pct_gdp",
]

# Compute average rent trends (cross-country mean per year), then pad to the
# full 2015–2023 range so years without data show up as gaps.
rent_trend = merged_df.groupby("year", as_index=False)[rent_cols].mean()
rent_trend = pd.DataFrame({"year": list(range(2015, 2024))}).merge(
    rent_trend, on="year", how="left"
)

fig2, axes2 = plt.subplots(2, 2, figsize=(14, 8), sharex=True)
axes2 = axes2.flatten()

rent_titles = ["Mineral Rents", "Oil Rents", "Gas Rents", "Total Natural Resource Rents"]
rent_colors = ["#8B4513", "#2C3E50", "#E67E22", "#27AE60"]

for ax, col, title_text, color in zip(axes2, rent_cols, rent_titles, rent_colors):

    # Period shading
    for _, label, start, end in PERIODS:
        ax.axvspan(start - 0.5, end + 0.5, alpha=PERIOD_ALPHA, color=PERIOD_COLORS[label], lw=0)

    # Plot series
    ax.plot(rent_trend["year"], rent_trend[col], marker="o", linewidth=2.5, color=color)

    # Formatting
    ax.axhline(0, linestyle="--", linewidth=1, alpha=0.3, color="gray")
    ax.set_title(title_text, fontweight="bold", fontsize=11)
    ax.set_ylabel("% of GDP", fontsize=9)
    ax.grid(True, alpha=0.25)
    ax.set_xticks(list(range(2015, 2024)))

# tick labels: sharex hides them on the top row by default, so re-enable
for ax in axes2:
    ax.tick_params(axis="x", labelbottom=True)

# Fixed y-limits per panel
axes2[0].set_ylim(0, 5)    # Mineral
axes2[1].set_ylim(0, 5)    # Oil
axes2[2].set_ylim(0, 2)    # Gas
axes2[3].set_ylim(0, 14)   # Total

# X-axis labels only on bottom row
for ax in axes2[2:]:
    ax.set_xlabel("Year", fontsize=9)

# Legend
period_handles = [
    Patch(facecolor=PERIOD_COLORS[label], edgecolor="none", alpha=PERIOD_ALPHA, label=label)
    for _, label, _, _ in PERIODS
]

fig2.legend(
    handles=period_handles,
    loc="lower center",
    ncol=3,
    frameon=True,
    bbox_to_anchor=(0.5, -0.02),
)

# Title
fig2.suptitle(
    "Resource Rents Trends Across EITI Countries",
    y=1.01,
    fontsize=15,
    fontweight="bold",
)

plt.tight_layout(rect=[0, 0.03, 1, 1])
plt.show()

# =====================================================
# VISUAL: WGI TRENDS (AVG ACROSS EITI COUNTRIES)
# - are the series trending up/down, and when (pre/pandemic/post)
# =====================================================

def add_period_labels(df, year_col="year"):
    """Return a copy of ``df`` with an added ``period`` column.

    Each row is tagged with the key (first element) of the ``PERIODS`` entry
    whose ``start``–``end`` range (inclusive) contains the row's year. Rows
    whose year falls in no period keep a missing value (NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    year_col : str, default "year"
        Name of the column holding the year. It is cast to ``int`` in the
        returned copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``year_col`` as integers and a ``period`` column.
    """
    out = df.copy()
    out[year_col] = out[year_col].astype(int)

    # Start from an object column: writing string keys into a float NaN
    # column is an incompatible-dtype assignment that newer pandas rejects.
    out["period"] = np.full(len(out), np.nan, dtype=object)

    for key, _, start, end in PERIODS:
        out.loc[out[year_col].between(start, end), "period"] = key
    return out

# Fail early when the governance columns are not available
needed = {"year", *WGI_COLS}
missing = needed.difference(merged_df.columns)
if missing:
    raise ValueError(f"merged_df missing required columns: {sorted(missing)}")

# Work on a copy with integer years
dfv = merged_df.copy()
dfv["year"] = dfv["year"].astype(int)

# Yearly cross-country mean of every WGI dimension
in_range = dfv["year"].between(2015, 2023)
trend = (
    dfv.loc[in_range, ["year", *WGI_COLS]]
       .groupby("year", as_index=False)
       .mean(numeric_only=True)
)

# Left-join onto the full year list so years without data show up as gaps
trend = pd.DataFrame({"year": ALL_YEARS}).merge(trend, on="year", how="left")

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(16, 8), sharex=True)
axes = axes.flatten()

for ax, col in zip(axes, WGI_COLS):
    # Period bands behind the line
    for _, label, start, end in PERIODS:
        ax.axvspan(start - 0.5, end + 0.5, alpha=PERIOD_ALPHA,
                   color=PERIOD_COLORS[label], lw=0)

    ax.plot(trend["year"], trend[col], marker="o", linewidth=2)
    ax.axhline(0, linestyle="--", linewidth=1)

    ax.set_title(TITLE_MAP.get(col, col), fontweight="bold", fontsize=11)
    ax.set_xlabel("Year")
    ax.set_ylabel("Average Score")
    ax.grid(True, alpha=0.25)

    # One tick per year, with labels shown on every panel
    ax.set_xticks(ALL_YEARS)
    ax.tick_params(axis="x", labelbottom=True)

# Shared legend explaining the shaded periods
legend_handles = []
for _, label, _, _ in PERIODS:
    legend_handles.append(
        Patch(facecolor=PERIOD_COLORS[label], edgecolor="none",
              alpha=PERIOD_ALPHA, label=label)
    )
fig.legend(handles=legend_handles, loc="lower center", ncol=3,
           frameon=True, bbox_to_anchor=(0.5, -0.02))

fig.suptitle("WGI Trends Across EITI Countries (2015–2023)",
             y=1.01, fontsize=15, fontweight="bold")
plt.tight_layout(rect=[0, 0.06, 1, 1])
plt.show()

# =====================================================
# VISUAL: Country-level governance change world map
#   Δ = mean(WGI POST) − mean(WGI PRE)
# - adds geography: where improvements/declines are concentrated
# =====================================================

# -----------------------------
# 0) Safety checks + prep
# -----------------------------
needed = {"iso3c", "year", "wgi_mean"}
missing = needed.difference(merged_df.columns)
if missing:
    raise ValueError(f"merged_df missing required columns: {sorted(missing)}")

# Restrict to the analysis window and keep only the columns used below
in_window = merged_df["year"].between(WINDOW_START, WINDOW_END)
df = merged_df.loc[in_window, ["iso3c", "year", "wgi_mean"]].copy()

# Normalise keys and make sure the outcome is numeric
df["iso3c"] = df["iso3c"].astype(str).str.upper().str.strip()
df["year"] = df["year"].astype(int)
df["wgi_mean"] = pd.to_numeric(df["wgi_mean"], errors="coerce")
df = df.dropna(subset=["iso3c", "year", "wgi_mean"])

# -----------------------------
# 1) Compute country-level PRE vs POST means + Δ
# -----------------------------
def _country_mean_wgi(frame, first_year, last_year, out_name):
    """Mean wgi_mean per country over [first_year, last_year], named out_name."""
    rows = frame.loc[frame["year"].between(first_year, last_year)]
    means = rows.groupby("iso3c", as_index=False)["wgi_mean"].mean()
    return means.rename(columns={"wgi_mean": out_name})

pre = _country_mean_wgi(df, PRE_START, PRE_END, "pre_mean")
post = _country_mean_wgi(df, POST_START, POST_END, "post_mean")

# Only countries observed in both periods get a delta (inner join)
delta_df = pre.merge(post, on="iso3c", how="inner")
delta_df["delta_post_minus_pre"] = pd.to_numeric(
    delta_df["post_mean"] - delta_df["pre_mean"], errors="coerce"
)

# keep only valid deltas
delta_df = (
    delta_df[["iso3c", "pre_mean", "post_mean", "delta_post_minus_pre"]]
    .dropna(subset=["delta_post_minus_pre"])
)

# -----------------------------
# 2) Load world boundaries (Natural Earth) + merge Δ
# -----------------------------
WORLD_URL = (
    "https://naturalearth.s3.amazonaws.com/50m_cultural/"
    "ne_50m_admin_0_countries.zip"
)

world = gpd.read_file(WORLD_URL)

# Natural Earth uses the placeholder "-99" in ISO_A3 for some sovereign
# countries (reported for e.g. France and Norway in several releases), which
# would silently drop them from the join below. Where that happens, fall back
# to the alternative code columns when the downloaded file provides them.
iso_codes = world["ISO_A3"].astype(str).str.upper().str.strip()
for fallback_col in ("ISO_A3_EH", "ADM0_A3"):
    if fallback_col not in world.columns:
        continue
    raw_fallback = world[fallback_col]
    fallback = raw_fallback.astype(str).str.upper().str.strip()
    usable = raw_fallback.notna() & fallback.str.fullmatch(r"[A-Z]{3}")
    iso_codes = iso_codes.mask(iso_codes.eq("-99") & usable, fallback)

world = world.loc[:, ["ISO_A3", "ADMIN", "geometry"]].rename(
    columns={"ISO_A3": "iso3c", "ADMIN": "country"}
)
world["iso3c"] = iso_codes

# Left join keeps every country polygon; those without a delta stay NaN
map_df = world.merge(
    delta_df.loc[:, ["iso3c", "delta_post_minus_pre"]],
    on="iso3c",
    how="left"
)

# for ranking/tables: only those that exist on the map + have delta
available = map_df.dropna(subset=["delta_post_minus_pre"]).copy()

# -----------------------------
# 3) Choose label countries (top/bottom K by Δ)
# -----------------------------
TOP_K = 10

top_improvers_iso3 = available.nlargest(TOP_K, "delta_post_minus_pre")["iso3c"].tolist()
top_decliners_iso3 = available.nsmallest(TOP_K, "delta_post_minus_pre")["iso3c"].tolist()
label_iso3 = set(top_improvers_iso3 + top_decliners_iso3)

# -----------------------------
# 4) Plot map
# -----------------------------
cmap = plt.cm.RdBu  # red = negative, blue = positive

# Symmetric colour range around zero. TwoSlopeNorm requires
# vmin < vcenter < vmax, so fall back to a unit range when there are no
# deltas at all or every delta is exactly zero.
delta_abs = map_df["delta_post_minus_pre"].abs().dropna()
vmax = float(delta_abs.max()) if not delta_abs.empty else float("nan")
if not np.isfinite(vmax) or vmax <= 0:
    vmax = 1.0
norm = TwoSlopeNorm(vmin=-vmax, vcenter=0, vmax=vmax)

fig, ax = plt.subplots(figsize=(16, 8))

# base layer (all countries)
map_df.plot(ax=ax, color="#EEEEEE", edgecolor="white", linewidth=0.4)

# data layer (only those with delta); skipped when nothing can be coloured
data_layer = map_df.dropna(subset=["delta_post_minus_pre"])
if not data_layer.empty:
    data_layer.plot(
        ax=ax,
        column="delta_post_minus_pre",
        cmap=cmap,
        norm=norm,
        edgecolor="white",
        linewidth=0.4
    )

ax.set_title(
    "Country-level Change in Governance (Mean WGI)",
    fontsize=14,
    fontweight="bold",
    pad=22
)

# Subtitle placed just above the axes, under the main title
ax.text(
    0.5, 1.01,
    f"Δ = mean(WGI {POST_START}–{POST_END}) − mean(WGI {PRE_START}–{PRE_END})",
    transform=ax.transAxes,
    ha="center",
    va="bottom",
    fontsize=12
)

ax.set_axis_off()

# consistent view window
ax.set_xlim(-170, 190)
ax.set_ylim(-60, 85)

# -----------------------------
# 5) Label top improvers/decliners (ISO3 + Δ)
# -----------------------------
label_df = map_df.loc[map_df["iso3c"].isin(label_iso3)].dropna(subset=["delta_post_minus_pre"]).copy()

# representative_point avoids multipolygon centroid issues
label_df["pt"] = label_df.geometry.representative_point()
label_df["x"] = label_df["pt"].x
label_df["y"] = label_df["pt"].y

for _, r in label_df.iterrows():
    ax.text(
        r["x"] + 0.8,
        r["y"] + 0.6,
        f"{r['iso3c']}\n{r['delta_post_minus_pre']:+.2f}",
        fontsize=8,
        ha="left",
        va="center",
        bbox=dict(boxstyle="round,pad=0.2", fc="white", ec="black", lw=0.5, alpha=0.9),
        zorder=5
    )

# -----------------------------
# 6) Colorbar (bottom)
# -----------------------------
# Stand-alone mappable so the colorbar reflects norm/cmap; set_array is the
# public way to give it an (empty) data array.
sm = ScalarMappable(norm=norm, cmap=cmap)
sm.set_array([])

cbar = fig.colorbar(
    sm, ax=ax,
    orientation="horizontal",
    fraction=0.045,
    pad=0.06
)
cbar.set_label("Change in governance (Δ wgi_mean)", fontsize=11)
cbar.ax.tick_params(labelsize=10)

plt.tight_layout()
plt.show()

# -----------------------------
# 7) Side-by-side tables: Top improvers vs decliners
# -----------------------------
# One row per country that is on the map and has a delta
rank_df = available.loc[:, ["iso3c", "country", "delta_post_minus_pre"]]
rank_df = rank_df.drop_duplicates(subset=["iso3c"])
rank_df = rank_df.rename(columns={"delta_post_minus_pre": "delta"}).copy()

rank_df["delta"] = pd.to_numeric(rank_df["delta"], errors="coerce")

top_improvers_tbl = rank_df.nlargest(TOP_K, "delta")
top_improvers_tbl = top_improvers_tbl.assign(delta=top_improvers_tbl["delta"].round(3))
top_decliners_tbl = rank_df.nsmallest(TOP_K, "delta")
top_decliners_tbl = top_decliners_tbl.assign(delta=top_decliners_tbl["delta"].round(3))


def _draw_delta_table(target_ax, heading, tbl):
    """Render one ranking table (ISO3, country, delta) with a bold header row."""
    target_ax.set_title(heading, fontsize=10, fontweight="bold", pad=10)
    table = target_ax.table(
        cellText=tbl[["iso3c", "country", "delta"]].values,
        colLabels=["ISO3", "Country", "Δ wgi_mean"],
        loc="center",
        cellLoc="center",
        bbox=[0, 0.05, 1, 0.9]
    )
    table.auto_set_font_size(False)
    table.set_fontsize(9)
    table.scale(1, 1.4)
    for (row_idx, _col_idx), cell in table.get_celld().items():
        if row_idx == 0:  # header row
            cell.set_text_props(weight="bold")
    return table


fig, axes = plt.subplots(1, 2, figsize=(12, 3))
for ax in axes:
    ax.axis("off")

# left: improvers, right: decliners
t1 = _draw_delta_table(axes[0], "Top Improvers (Δ wgi_mean)", top_improvers_tbl)
t2 = _draw_delta_table(axes[1], "Top Decliners (Δ wgi_mean)", top_decliners_tbl)

plt.tight_layout()
plt.show()

# =====================================================
# VISUAL: Average Governance Level world map (overall level)
#   Level = country mean of wgi_mean over 2015-2023
# - shows starting/overall levels (important because “improving” ≠ “good”; some improve from very low baselines
# =====================================================

# -----------------------------
# 0) Safety checks + compute country-level levels
# -----------------------------
needed = {"iso3c", "year", "wgi_mean"}
missing = needed - set(merged_df.columns)
if missing:
    raise ValueError(f"merged_df missing required columns: {sorted(missing)}. "
                     f"Make sure you computed wgi_mean earlier.")

# Analysis window only, with normalised keys
df = merged_df.loc[
    merged_df["year"].between(WINDOW_START, WINDOW_END),
    ["iso3c", "year", "wgi_mean"]
].copy()

df["iso3c"] = df["iso3c"].astype(str).str.upper().str.strip()
df["year"] = df["year"].astype(int)

# Country-level mean of wgi_mean across the window
level_df = (
    df.groupby("iso3c", as_index=False)["wgi_mean"]
      .mean()
      .rename(columns={"wgi_mean": "avg_wgi_level"})
)

# -----------------------------
# 1) Load world boundaries
# -----------------------------
WORLD_URL = (
    "https://naturalearth.s3.amazonaws.com/50m_cultural/"
    "ne_50m_admin_0_countries.zip"
)

world = gpd.read_file(WORLD_URL)

# Natural Earth uses the placeholder "-99" in ISO_A3 for some sovereign
# countries (reported for e.g. France and Norway in several releases), which
# would silently drop them from the join below. Where that happens, fall back
# to the alternative code columns when the downloaded file provides them.
iso_codes = world["ISO_A3"].astype(str).str.upper().str.strip()
for fallback_col in ("ISO_A3_EH", "ADM0_A3"):
    if fallback_col not in world.columns:
        continue
    raw_fallback = world[fallback_col]
    fallback = raw_fallback.astype(str).str.upper().str.strip()
    usable = raw_fallback.notna() & fallback.str.fullmatch(r"[A-Z]{3}")
    iso_codes = iso_codes.mask(iso_codes.eq("-99") & usable, fallback)

world = world.loc[:, ["ISO_A3", "ADMIN", "geometry"]].rename(
    columns={"ISO_A3": "iso3c", "ADMIN": "country"}
)
world["iso3c"] = iso_codes

# -----------------------------
# 2) Merge LEVEL onto map
# -----------------------------
map_df = world.merge(level_df, on="iso3c", how="left")

# -----------------------------
# 3) Select countries to label (Top/Bottom K by level)
# -----------------------------
TOP_K = 10

available = map_df.dropna(subset=["avg_wgi_level"]).copy()

top_iso3 = available.nlargest(TOP_K, "avg_wgi_level")["iso3c"].tolist()
bot_iso3 = available.nsmallest(TOP_K, "avg_wgi_level")["iso3c"].tolist()
label_iso3 = set(top_iso3 + bot_iso3)

# -----------------------------
# 4) Plot map
# -----------------------------
cmap = plt.cm.RdYlBu  # low=red, high=blue

# Symmetric colour range around zero. TwoSlopeNorm requires
# vmin < vcenter < vmax, so fall back to a unit range when there are no
# levels at all or every level is exactly zero.
level_abs = map_df["avg_wgi_level"].abs().dropna()
vmax = float(level_abs.max()) if not level_abs.empty else float("nan")
if not np.isfinite(vmax) or vmax <= 0:
    vmax = 1.0
norm = TwoSlopeNorm(vmin=-vmax, vcenter=0.0, vmax=vmax)

fig, ax = plt.subplots(figsize=(16, 8))

# base layer
map_df.plot(ax=ax, color="#EEEEEE", edgecolor="white", linewidth=0.4)

# data layer; skipped when nothing can be coloured
data_layer = map_df.dropna(subset=["avg_wgi_level"])
if not data_layer.empty:
    data_layer.plot(
        ax=ax,
        column="avg_wgi_level",
        cmap=cmap,
        norm=norm,
        edgecolor="white",
        linewidth=0.4
    )

ax.set_title(
    "Average Governance Level (Mean WGI)",
    fontsize=14,
    fontweight="bold",
    pad=22
)

# Subtitle placed just above the axes, under the main title
ax.text(
    0.5, 1.01,
    f"6 WGI Dimensions Averaged Across {WINDOW_START}–{WINDOW_END}",
    transform=ax.transAxes,
    ha="center",
    va="bottom",
    fontsize=13
)
ax.set_axis_off()
ax.set_xlim(-170, 190)
ax.set_ylim(-60, 85)

# -----------------------------
# 5) Labels (Top/Bottom K)
# -----------------------------
label_df = map_df.loc[map_df["iso3c"].isin(label_iso3)].dropna(subset=["avg_wgi_level"]).copy()

# A representative point is used as the label anchor for each country
label_df["pt"] = label_df.geometry.representative_point()
label_df["x"] = label_df["pt"].x
label_df["y"] = label_df["pt"].y

for _, r in label_df.iterrows():
    ax.text(
        r["x"] + 0.8,
        r["y"] + 0.6,
        f"{r['iso3c']}\n{r['avg_wgi_level']:+.2f}",
        fontsize=8,
        ha="left",
        va="center",
        bbox=dict(boxstyle="round,pad=0.2", fc="white", ec="black", lw=0.5, alpha=0.9),
        zorder=5
    )

# -----------------------------
# 6) Colorbar (bottom)
# -----------------------------
# Stand-alone mappable so the colorbar reflects norm/cmap; set_array is the
# public way to give it an (empty) data array.
sm = ScalarMappable(norm=norm, cmap=cmap)
sm.set_array([])

cbar = fig.colorbar(
    sm, ax=ax,
    orientation="horizontal",
    fraction=0.045,
    pad=0.06
)
cbar.set_label(f"Average WGI score ({WINDOW_START}–{WINDOW_END})", fontsize=11)
cbar.ax.tick_params(labelsize=10)

plt.tight_layout()
plt.show()

# -----------------------------
# 7) Side-by-side tables (Top vs Bottom K)
# -----------------------------
# One row per mapped country with an average level
rank_df = available.loc[:, ["iso3c", "country", "avg_wgi_level"]]
rank_df = rank_df.drop_duplicates(subset=["iso3c"])
rank_df = rank_df.rename(columns={"avg_wgi_level": "avg_wgi"}).copy()

top_tbl = rank_df.nlargest(TOP_K, "avg_wgi")
top_tbl = top_tbl.assign(avg_wgi=top_tbl["avg_wgi"].round(3))
bot_tbl = rank_df.nsmallest(TOP_K, "avg_wgi")
bot_tbl = bot_tbl.assign(avg_wgi=bot_tbl["avg_wgi"].round(3))


def _draw_level_table(target_ax, heading, tbl):
    """Render one level table (ISO3, country, average) with title, subtitle and bold header."""
    target_ax.set_title(heading, fontsize=10, fontweight="bold", pad=10)
    target_ax.text(
        0.5, 0.98,
        f"(Avg WGI {WINDOW_START}–{WINDOW_END})",
        transform=target_ax.transAxes,
        ha="center", va="bottom",
        fontsize=10, fontweight="normal"
    )
    table = target_ax.table(
        cellText=tbl[["iso3c", "country", "avg_wgi"]].values,
        colLabels=["ISO3", "Country", "Avg WGI"],
        loc="center",
        cellLoc="center",
        colWidths=[0.18, 0.55, 0.27],
        bbox=[0, 0.05, 1, 0.9]
    )
    table.auto_set_font_size(False)
    table.set_fontsize(9)
    table.scale(1, 1.4)
    for (row_idx, _col_idx), cell in table.get_celld().items():
        if row_idx == 0:  # header row
            cell.set_text_props(weight="bold")
    return table


fig, axes = plt.subplots(1, 2, figsize=(12, 3))
for a in axes:
    a.axis("off")

# left: highest levels, right: lowest levels
t1 = _draw_level_table(axes[0], f"Top {TOP_K} Governance Levels", top_tbl)
t2 = _draw_level_table(axes[1], f"Bottom {TOP_K} Governance Levels", bot_tbl)

plt.tight_layout()
plt.show()

# =====================================================
# VISUAL: Correlation matrix of WGI dimensions
#   - Pearson correlations
#   - Country–year observations (pooled, 2015–2023)
#   - Lower triangle only + annotated coefficients
# - shows the dimensions move together (supports using wgi_mean as a summary outcome)
# =====================================================

# -----------------------------
# 0) Safety check
# -----------------------------
needed = {"year", *WGI_COLS}
missing = needed - set(merged_df.columns)
if missing:
    raise ValueError(f"merged_df missing required columns: {sorted(missing)}")

# -----------------------------
# 1) Prepare data (analysis window)
# -----------------------------
# Pooled country-year rows with all WGI dimensions present
corr_df = (
    merged_df.loc[merged_df["year"].between(WINDOW_START, WINDOW_END), WGI_COLS]
             .dropna()
)

# -----------------------------
# 2) Correlation matrix
# -----------------------------
corr_matrix = corr_df.corr(method="pearson")

# -----------------------------
# 3) Mask upper triangle
# -----------------------------
# True above the diagonal (k=1 keeps the diagonal itself visible)
mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)

# -----------------------------
# 4) Plot
# -----------------------------
fig, ax = plt.subplots(figsize=(7.5, 6.5))

# Apply the mask to the image as well, so the redundant upper triangle is
# left blank instead of being coloured without annotations.
lower_triangle = np.ma.masked_array(corr_matrix.to_numpy(dtype=float), mask=mask)

im = ax.imshow(
    lower_triangle,
    cmap="Blues",
    vmin=0, vmax=1  # 0 = white, 1 = darkest
)

# ticks & labels
pretty_labels = [TITLE_MAP.get(c, c.replace("_", " ").title()) for c in WGI_COLS]
ax.set_xticks(range(len(WGI_COLS)))
ax.set_yticks(range(len(WGI_COLS)))
ax.set_xticklabels(pretty_labels, rotation=45, ha="right")
ax.set_yticklabels(pretty_labels)

# annotate lower triangle only
for i in range(len(WGI_COLS)):
    for j in range(len(WGI_COLS)):
        if not mask[i, j]:
            ax.text(
                j, i,
                f"{corr_matrix.iloc[i, j]:.2f}",
                ha="center", va="center",
                fontsize=9
            )

# colorbar
cbar = fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
cbar.set_label("Pearson correlation", fontsize=11)
cbar.ax.tick_params(labelsize=10)

ax.set_title("Correlation Matrix of WGI Dimensions (2015–2023)", fontsize=11,  fontweight="bold")
plt.tight_layout()
plt.show()

# =====================================================
# VISUAL: Governance Clustering (k=2 vs k=3)
#   - Features: baseline (2019–2021 mean of 6 WGI dims) + change (POST − PRE by dim)
#   - Shows:
#       (1) Silhouette analysis (k=2..7)
#       (2) Cluster summaries (means)
#       (3) PCA scatter: k=2 and k=3 side by side (same PCA space)
#       (4) Compact membership tables (ISO3s)
# =====================================================

# -----------------------------------------------------
# 0) Safety checks + minimal prep
# -----------------------------------------------------
needed = {"iso3c", "year", *WGI_COLS}
missing = needed.difference(merged_df.columns)
if missing:
    raise ValueError(f"merged_df missing required columns: {sorted(missing)}")

# Analysis window, identifier columns plus the WGI dimensions
in_window = merged_df["year"].between(WINDOW_START, WINDOW_END)
df = merged_df.loc[in_window, ["iso3c", "year"] + WGI_COLS].copy()

df["iso3c"] = df["iso3c"].astype(str).str.upper().str.strip()
df["year"] = pd.to_numeric(df["year"], errors="coerce")

# Coerce every dimension to numeric; unparseable entries become NaN
for dim in WGI_COLS:
    df[dim] = pd.to_numeric(df[dim], errors="coerce")

df = df.dropna(subset=["iso3c", "year"])

# -----------------------------------------------------
# 1) Construct clustering features: baseline + delta (per dimension)
# -----------------------------------------------------
BASE_START, BASE_END = 2019, 2021


def _wgi_means_by_country(frame, first_year, last_year):
    """Per-country mean of each WGI dimension over [first_year, last_year]."""
    rows = frame.loc[frame["year"].between(first_year, last_year)]
    return rows.groupby("iso3c")[WGI_COLS].mean()


# Baseline level: countries with any missing dimension are dropped
baseline = (
    _wgi_means_by_country(df, BASE_START, BASE_END)
    .dropna()
    .add_suffix(f"_baseline_{BASE_START}_{BASE_END}")
)

pre_means = _wgi_means_by_country(df, PRE_START, PRE_END)
post_means = _wgi_means_by_country(df, POST_START, POST_END)

# Change per dimension: POST mean minus PRE mean
delta = (post_means - pre_means).add_prefix("delta_")

# Feature matrix: countries with a complete baseline and a complete delta
X = baseline.join(delta, how="inner").dropna()

print(f"Clustering feature matrix X: n_countries={X.shape[0]}, n_features={X.shape[1]}")
if X.shape[0] < 5:
    raise ValueError(f"Too few countries with complete clustering features (n={X.shape[0]}).")

baseline_cols = [name for name in X.columns if "baseline" in name]
if not baseline_cols:
    raise ValueError("No baseline columns found in X (expected columns containing 'baseline').")

# -----------------------------------------------------
# 2) Standardize features
# -----------------------------------------------------
# Standardise every feature before distance-based clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# -----------------------------------------------------
# 3) Silhouette analysis (k = 2..7)
# -----------------------------------------------------
# The silhouette score is only defined for 2 <= k <= n_samples - 1, so cap
# the upper bound for small samples instead of failing inside the loop.
K_MAX = min(7, X.shape[0] - 1)

sil_scores = {}
for k in range(2, K_MAX + 1):
    km = KMeans(n_clusters=k, random_state=42, n_init=20)
    labels = km.fit_predict(X_scaled)
    sil_scores[k] = float(silhouette_score(X_scaled, labels))

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(list(sil_scores.keys()), list(sil_scores.values()), marker="o")
ax.set_xlabel("Number of clusters (k)")
ax.set_ylabel("Silhouette score")
ax.set_title("Silhouette Analysis (K-means)")
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

print("Silhouette scores:")
for k in sorted(sil_scores):
    print(f"  k={k}: {sil_scores[k]:.3f}")

# -----------------------------------------------------
# 4) Fit K-means for k=2 and k=3 and label clusters by baseline level
# -----------------------------------------------------
def make_cluster_labels(cluster_df, baseline_cols, k):
    """Map cluster ids to readable names, ranked by baseline governance.

    Clusters are ordered from lowest to highest by the average of their
    per-column means over ``baseline_cols``. For ``k == 2`` and ``k == 3`` the
    ranked clusters receive fixed tier names; for any other ``k`` each cluster
    is named ``"Cluster <id>"``.

    Returns a dict ``{cluster_id: label}``.
    """
    tier_names = {
        2: ("Non-Strong Governance", "Strong Governance"),
        3: ("Weak Governance", "Moderate Governance", "Strong Governance"),
    }

    # Rank clusters from low to high baseline governance
    per_cluster = cluster_df.groupby("cluster")[baseline_cols].mean()
    ranked = per_cluster.mean(axis=1).sort_values().index.tolist()

    if k not in tier_names:
        return {cluster_id: f"Cluster {cluster_id}" for cluster_id in ranked}
    return {ranked[pos]: name for pos, name in enumerate(tier_names[k])}

# Fit k=2 and k=3 on the standardised features; keep one labelled frame per k
results = {}
for k in (2, 3):
    km = KMeans(n_clusters=k, random_state=42, n_init=20)
    cl = km.fit_predict(X_scaled)

    cluster_df = X.assign(cluster=cl)
    label_map = make_cluster_labels(cluster_df, baseline_cols, k)
    cluster_df["cluster_label"] = cluster_df["cluster"].map(label_map)

    results[k] = cluster_df

# -----------------------------------------------------
# 5) Cluster summaries (means)
# -----------------------------------------------------
for k in (2, 3):
    fitted = results[k]
    print(f"\n--- Cluster summary means (k={k}) ---")
    feature_means = fitted.groupby("cluster")[X.columns].mean()
    display(feature_means.round(2))
    print("Cluster sizes:")
    print(fitted["cluster_label"].value_counts())

# -----------------------------------------------------
# 6) PCA computed. plot for k=2 and k=3 + ISO3 labels on extremes
# -----------------------------------------------------
# Two-component PCA on the same standardised features used for clustering
pca = PCA(n_components=2)
pcs = pca.fit_transform(X_scaled)

print("\nPCA explained variance ratio:", np.round(pca.explained_variance_ratio_, 3))

# -----------------------------
# PC loadings (what each PC represents)
# -----------------------------
# components_ is (n_components, n_features); transpose to one row per feature
loadings = pd.DataFrame(
    pca.components_.T,
    index=X.columns,
    columns=["PC1_loading", "PC2_loading"]
)

# PC1 loadings ordered by absolute size, largest first
pc1_tbl = loadings[["PC1_loading"]].sort_values(
    "PC1_loading", key=lambda s: s.abs(), ascending=False
)

print("\nPC1 loadings (sorted by absolute magnitude):")
display(pc1_tbl.round(3))

for heading, subset in (
    ("\nTop 6 contributors to PC1:", pc1_tbl.head(6)),
    ("\nBottom 6 contributors to PC1:", pc1_tbl.tail(6)),
):
    print(heading)
    display(subset.round(3))

def iso3_to_country(iso3):
    """Return the country name for an ISO3 code, or the code itself.

    The name comes from ``pycountry``. The function is best-effort: if the
    package is unavailable, the code is unknown, or the lookup fails, the
    input ``iso3`` is returned unchanged.
    """
    try:
        # Imported here so a missing module-level import cannot turn every
        # lookup into a silently swallowed NameError.
        import pycountry
    except ImportError:
        return iso3

    try:
        match = pycountry.countries.get(alpha_3=iso3)
        return match.name if match else iso3
    except Exception:
        # Deliberate best-effort: the name is only used for a plot legend.
        return iso3

# -----------------------------
# PCA scores (shared across k=2 and k=3)
# -----------------------------
# PCA scores indexed like X, reused for both panels
pca_base = pd.DataFrame(pcs, columns=["PC1", "PC2"], index=X.index)

# Distance from the origin in PC space; the farthest points get labelled
pca_base["dist"] = np.sqrt(np.square(pca_base["PC1"]) + np.square(pca_base["PC2"]))

LABEL_N = 10
label_iso3 = pca_base.nlargest(LABEL_N, "dist").index.tolist()

# ISO3 -> country text for the labelled points, sorted by code
iso_country_pairs = sorted(
    ((iso, iso3_to_country(iso)) for iso in label_iso3),
    key=lambda pair: pair[0],
)
legend_text = " | ".join(f"{iso} = {cty}" for iso, cty in iso_country_pairs)

# -----------------------------
# Axis padding
# -----------------------------
xmin, xmax = float(pca_base["PC1"].min()), float(pca_base["PC1"].max())
ymin, ymax = float(pca_base["PC2"].min()), float(pca_base["PC2"].max())

# 10% of the data range on each side; fixed padding for a degenerate range
xpad = 1.0 if not xmax > xmin else 0.10 * (xmax - xmin)
ypad = 1.0 if not ymax > ymin else 0.10 * (ymax - ymin)

# -----------------------------
# Side-by-side PCA plots with labels
# -----------------------------
fig, axes = plt.subplots(ncols=2, figsize=(16, 6.5), sharex=True, sharey=True)
pc1_share, pc2_share = pca.explained_variance_ratio_[0], pca.explained_variance_ratio_[1]

for ax, k in zip(axes, [2, 3]):
    cluster_df = results[k]
    plot_df = pca_base.copy()
    plot_df["cluster_label"] = cluster_df["cluster_label"].values

    # one scatter call per cluster label, so each gets its own legend entry
    for lab in plot_df["cluster_label"].unique():
        members = plot_df[plot_df["cluster_label"] == lab]
        ax.scatter(members["PC1"], members["PC2"], label=lab)

    # label the same extreme points in both panels
    for iso3 in label_iso3:
        point = plot_df.loc[iso3]
        ax.annotate(
            iso3,
            (point["PC1"], point["PC2"]),
            textcoords="offset points",
            xytext=(5, 5),
            fontsize=9,
            bbox=dict(boxstyle="round,pad=0.15", fc="white", ec="none", alpha=0.8)
        )

    ax.set_title(
        f"K-means clusters (k={k}) on shared PCA space\n"
        f"PC1 {pc1_share*100:.1f}% | PC2 {pc2_share*100:.1f}%",
        fontsize=11,
        fontweight="bold"
    )
    # origin cross-hairs and a light grid
    ax.axhline(0, linestyle="--", linewidth=1, alpha=0.4)
    ax.axvline(0, linestyle="--", linewidth=1, alpha=0.4)
    ax.grid(alpha=0.25)

    # legends: force bottom-right for both
    ax.legend(frameon=True, loc="lower right")

# axis titles: PC1 on both panels, PC2 on the left one only
for panel in axes:
    panel.set_xlabel("PC1")
axes[0].set_ylabel("PC2")

# widen limits so points don't clip (set on the first of the two shared axes)
axes[0].set_xlim(xmin - xpad, xmax + xpad)
axes[0].set_ylim(ymin - ypad, ymax + ypad)

# leave space at the bottom for ISO3=Country legend text
plt.tight_layout()
plt.subplots_adjust(bottom=0.28)

# single shared ISO3=Country block below both plots
fig.text(0.5, 0.18, legend_text, ha="center", va="center",
         fontsize=11, family="monospace")

plt.show()

# -----------------------------------------------------
# 7) Compact membership tables
# -----------------------------------------------------
def compact_membership(cluster_df):
    """Summarise cluster membership as one row per cluster label.

    ``cluster_df`` is expected to be indexed by country code and to carry a
    ``cluster_label`` column. The result has the columns ``cluster_label``,
    ``n`` (number of members) and ``countries`` (comma-separated, sorted
    codes), ordered by ``n`` descending with a fresh integer index.
    """
    def _joined_codes(codes):
        return ", ".join(sorted(codes))

    # Move the index into a regular "iso3c" column so it can be aggregated
    members = cluster_df.reset_index(names="iso3c")
    per_label = members.groupby("cluster_label", as_index=False).agg(
        n=("iso3c", "count"),
        countries=("iso3c", _joined_codes),
    )
    ranked = per_label.sort_values("n", ascending=False)
    return ranked.reset_index(drop=True)

# CSS applied to the long "countries" cell so it wraps instead of overflowing
membership_css = {
    "white-space": "normal",
    "max-width": "900px",
    "font-size": "10pt"
}

for k in (2, 3):
    print(f"\nMembership (k={k}):")
    styled = compact_membership(results[k]).style.set_properties(
        subset=["countries"], **membership_css
    )
    display(styled)

# Clustering feature matrix X: n_countries=59, n_features=12
#
# Silhouette scores:
#   k=2: 0.463
#   k=3: 0.204
#   k=4: 0.206
#   k=5: 0.183
#   k=6: 0.179
#   k=7: 0.169
#
# --- Cluster summary means (k=2) ---
#
# Cluster sizes:
# cluster_label
# Non-Strong Governance    54
# Strong Governance         5
# Name: count, dtype: int64
#
# --- Cluster summary means (k=3) ---
#
# Cluster sizes:
# cluster_label
# Moderate Governance    28
# Weak Governance        26
# Strong Governance       5
# Name: count, dtype: int64
#
# PCA explained variance ratio: [0.42  0.245]
#
# PC1 loadings (sorted by absolute magnitude):
#
# Top 6 contributors to PC1:
#
# Bottom 6 contributors to PC1:
#
# Membership (k=2):
#
# Membership (k=3):

# =====================================================
# VISUAL: Hierarchical Clustering (robustness check)
#   - Uses same standardized feature matrix as K-means (X_scaled)
#   - Shows:
#       (1) Dendrogram (Ward linkage)
#       (2) Cut-tree cluster labels for k=2 and k=3
#       (3) Compact membership tables (ISO3s)
# =====================================================

# -----------------------------------------------------
# 0) Safety checks
# -----------------------------------------------------
# Local import: makes sure the SciPy hierarchy helpers used in this section
# are in scope even if they were not imported at the top of the file.
from scipy.cluster.hierarchy import dendrogram, fcluster, linkage

if "X" not in globals() or "X_scaled" not in globals():
    raise ValueError("Need X and X_scaled from the K-means block. Run previous visual first.")

# Ensure index is ISO3 codes
iso3_index = X.index.astype(str).str.upper().str.strip()

# -----------------------------------------------------
# 1) Fit hierarchical clustering (Ward linkage)
#    Ward is a strong default when using Euclidean distance on standardized data
# -----------------------------------------------------
Z = linkage(X_scaled, method="ward")  # Euclidean distance implied

# -----------------------------------------------------
# 2) Dendrogram
# -----------------------------------------------------
fig, ax = plt.subplots(figsize=(18, 7))

# One leaf per country, labelled with its ISO3 code
dendrogram(
    Z,
    labels=iso3_index.tolist(),
    leaf_rotation=90,
    leaf_font_size=8,
    ax=ax
)

ax.set_title(
    "Hierarchical Clustering Dendrogram (Ward linkage)",
    fontsize=12,
    fontweight="bold"
)
ax.set_xlabel("Countries (ISO3)")
ax.set_ylabel("Ward distance")
ax.grid(axis="y", alpha=0.25)

plt.tight_layout()
plt.show()

# -----------------------------------------------------
# 3) Extract cluster assignments for k = 2 and k = 3
# -----------------------------------------------------
hc_labels = {}
for k in [2, 3]:
    # fcluster returns cluster ids: 1..k
    hc_labels[k] = fcluster(Z, t=k, criterion="maxclust")

# Build DataFrames
hc2 = pd.DataFrame({"iso3c": iso3_index, "hc_cluster": hc_labels[2]}).set_index("iso3c")
hc3 = pd.DataFrame({"iso3c": iso3_index, "hc_cluster": hc_labels[3]}).set_index("iso3c")

# -----------------------------------------------------
# 4) Interpretable labels based on baseline governance level
# -----------------------------------------------------
baseline_cols = [name for name in X.columns if "baseline" in name]
if not baseline_cols:
    raise ValueError("No baseline columns found in X (expected columns containing 'baseline').")

def label_by_baseline(cluster_series, baseline_cols, k):
    """Translate cluster ids into tier names ranked by baseline governance.

    The ids in ``cluster_series`` are attached positionally to the global
    feature matrix ``X``; clusters are then ordered from low to high by the
    average of their ``baseline_cols`` means. ``k == 2`` and ``k == 3`` use
    fixed tier names, any other ``k`` yields ``"Cluster <id>"``.

    Returns ``cluster_series`` mapped to those names.
    """
    tier_names = {
        2: ("Non-Strong Governance", "Strong Governance"),
        3: ("Weak Governance", "Moderate Governance", "Strong Governance"),
    }

    tmp = X.assign(cluster=cluster_series.values)
    per_cluster = tmp.groupby("cluster")[baseline_cols].mean()
    means = per_cluster.mean(axis=1).sort_values()  # low -> high
    order = means.index.tolist()

    if k in tier_names:
        mapping = {order[pos]: name for pos, name in enumerate(tier_names[k])}
    else:
        mapping = {cluster_id: f"Cluster {cluster_id}" for cluster_id in order}

    return cluster_series.map(mapping)

# Attach readable labels to both cut-tree solutions
hc2["cluster_label"] = label_by_baseline(hc2["hc_cluster"], baseline_cols, k=2)
hc3["cluster_label"] = label_by_baseline(hc3["hc_cluster"], baseline_cols, k=3)

for heading, labelled in (
    ("\nHierarchical cluster sizes (k=2):", hc2),
    ("\nHierarchical cluster sizes (k=3):", hc3),
):
    print(heading)
    print(labelled["cluster_label"].value_counts())

# -----------------------------------------------------
# 5) Cluster summary means (same features as K-means)
# -----------------------------------------------------
def cluster_summary(cluster_series):
    """Mean of every feature in the global ``X`` per cluster id, rounded to 2 decimals.

    The ids in ``cluster_series`` are attached to ``X`` by position.
    """
    tmp = X.assign(cluster=cluster_series.values)
    feature_means = tmp.groupby("cluster")[X.columns].mean()
    return feature_means.round(2)

for heading, assigned in (
    ("\n--- Hierarchical cluster summary means (k=2) ---", hc2),
    ("\n--- Hierarchical cluster summary means (k=3) ---", hc3),
):
    print(heading)
    display(cluster_summary(assigned["hc_cluster"]))

# -----------------------------------------------------
# 6) Membership tables (full ISO3 lists, wrapped)
# -----------------------------------------------------
def membership_table(cluster_df):
    """Build a one-row-per-label table of cluster size and member ISO3 codes.

    ``cluster_df`` must carry a ``cluster_label`` column; its index is moved
    into an ``iso3c`` column. The result has columns ``cluster_label``, ``n``
    (member count) and ``countries`` (sorted codes joined by ", "), ordered by
    ``n`` descending with a fresh 0..m-1 index.
    """
    flat = cluster_df.reset_index(names="iso3c")
    per_label = flat.groupby("cluster_label", as_index=False)
    table = per_label.agg(
        n=("iso3c", "count"),
        countries=("iso3c", lambda codes: ", ".join(sorted(codes))),
    )
    table = table.sort_values("n", ascending=False)
    return table.reset_index(drop=True)

# Render both membership tables with the (long) country list wrapped inside
# a fixed-width cell instead of overflowing horizontally.
_wrap_css = {"white-space": "normal", "max-width": "900px", "font-size": "10pt"}

for _header, _frame in (
    ("\nMembership (Hierarchical, k=2):", hc2),
    ("\nMembership (Hierarchical, k=3):", hc3),
):
    print(_header)
    _styled = membership_table(_frame).style.set_properties(subset=["countries"], **_wrap_css)
    display(_styled)

# -----------------------------------------------------
# 7) Compare K-means vs Hierarchical assignments
# -----------------------------------------------------
# Cross-tabulate K-means labels against hierarchical labels. Runs only when a
# `results` mapping with k=2 and k=3 entries exists in the session.
# NOTE(review): labels are paired positionally (.values); this presumes
# results[k], hc2/hc3 and X share the same row order - confirm upstream.
if "results" in globals() and all(n in results for n in (2, 3)):
    km2, km3 = results[2].copy(), results[3].copy()

    _iso3_index = pd.Index(X.index, name="iso3c")

    compare2 = pd.DataFrame(
        {
            "kmeans": km2["cluster_label"].values,
            "hierarchical": hc2["cluster_label"].values,
        },
        index=_iso3_index,
    )
    compare3 = pd.DataFrame(
        {
            "kmeans": km3["cluster_label"].values,
            "hierarchical": hc3["cluster_label"].values,
        },
        index=_iso3_index,
    )

    for _header, _pairs in (
        ("\nAgreement table (k=2):", compare2),
        ("\nAgreement table (k=3):", compare3),
    ):
        print(_header)
        display(pd.crosstab(_pairs["kmeans"], _pairs["hierarchical"]))

Hierarchical cluster sizes (k=2):
cluster_label
Non-Strong Governance    54
Strong Governance         5
Name: count, dtype: int64

Hierarchical cluster sizes (k=3):
cluster_label
Moderate Governance    51
Strong Governance       5
Weak Governance         3
Name: count, dtype: int64

--- Hierarchical cluster summary means (k=2) ---

--- Hierarchical cluster summary means (k=3) ---

Membership (Hierarchical, k=2):

Membership (Hierarchical, k=3):

Agreement table (k=2):

Agreement table (k=3):

# =====================================================
# REGRESSION MODEL:
#   wgi_mean ~ L1(total natural resource rents % GDP)
#   Models: Pooled, Country FE, Country+Year FE
#   SEs   : Clustered by country (iso3c)
# =====================================================

# -----------------------------
# 0) CONFIG
# -----------------------------
Y = "wgi_mean"
X_RENTS = "total_natural_resource_rents_pct_gdp"

USE_LAG = True
LAG_YEARS = 1

WINSORIZE = True
WINSOR_P = 0.01  # 1% tails

# -----------------------------
# 1) Prep + checks
# -----------------------------
needed = {"iso3c", "year", Y, X_RENTS}
missing = needed - set(merged_df.columns)
if missing:
    raise ValueError(f"merged_df missing required columns: {sorted(missing)}")

# Work on a narrow copy so merged_df itself is never modified.
df = merged_df.loc[:, ["iso3c", "year", Y, X_RENTS]].copy()

df["iso3c"] = df["iso3c"].astype(str).str.upper().str.strip()
df["year"] = pd.to_numeric(df["year"], errors="coerce")

df[Y] = pd.to_numeric(df[Y], errors="coerce")
df[X_RENTS] = pd.to_numeric(df[X_RENTS], errors="coerce")

# Only the outcome has to be observed at this stage. Rows with missing
# contemporaneous rents are kept on purpose: when the regressor is the lag,
# year t only needs rents at t-LAG_YEARS, and dropping on X_RENTS here would
# discard outcome years whose lagged rents are available (the two robustness
# checks below filter the same way). Missing regressor values are removed
# later by the dropna on the final regressor column.
df = df.dropna(subset=["iso3c", "year", Y]).copy()

# -----------------------------
# 2) winsorization
#    - We winsorize X_RENTS to reduce the influence of extreme outliers.
#    - Specifically, we cap values below the 1st percentile to the 1st percentile,
#    - and values above the 99th percentile to the 99th percentile.
#    - This preserves all observations (unlike trimming/dropping) but limits leverage from extremes.
# -----------------------------
def winsorize_series(s, p=0.01):
    """Cap a numeric Series at its ``p`` and ``1 - p`` quantiles.

    Parameters
    ----------
    s : pandas.Series
        Values to winsorize. Missing values stay missing.
    p : float, default 0.01
        Tail share clipped on each side; must lie in ``[0, 0.5]``.

    Returns
    -------
    pandas.Series
        Same index and length as ``s`` with both tails clipped.

    Raises
    ------
    ValueError
        If ``p`` is outside ``[0, 0.5]``; larger values would put the lower
        bound above the upper bound and silently collapse the series.
    """
    if not 0 <= p <= 0.5:
        raise ValueError(f"p must be between 0 and 0.5, got {p!r}")
    lo = s.quantile(p)
    hi = s.quantile(1 - p)
    return s.clip(lower=lo, upper=hi)

if WINSORIZE:
    df[X_RENTS] = winsorize_series(df[X_RENTS], p=WINSOR_P)

# -----------------------------
# 3) lag
#    - We use a lag of total natural resource rents (% of GDP) to reduce simultaneity
#    - and reverse causality between resource rents and governance (WGI).
#    - Specifically, X at time t−1 is used to explain WGI at time t.
#
#    - This ensures temporal ordering: changes in resource rents precede changes in governance.
#    - The lag is computed within each country (iso3c) after sorting by year.
#    - shift() steps back LAG_YEARS *rows*, so the lag is kept only where the
#    - previous row really is LAG_YEARS calendar years earlier; across a gap in
#    - a country's series the lag is set to missing instead of silently
#    - pairing year t with an older year.
#
#    - Observations from the first available year per country become missing after lagging
#    - and are dropped from the regression sample.
#
#    NOTE(review): `X` is rebound to a column-name string here, while the
#    clustering helpers above (label_by_baseline, cluster_summary) read a
#    module-level `X` via X.copy(). Re-create the feature matrix before
#    re-running those helpers after this cell.
# -----------------------------
if USE_LAG:
    df = df.sort_values(["iso3c", "year"])
    X = f"{X_RENTS}_L{LAG_YEARS}"
    _year_gap = df["year"] - df.groupby("iso3c")["year"].shift(LAG_YEARS)
    df[X] = df.groupby("iso3c")[X_RENTS].shift(LAG_YEARS).where(_year_gap == LAG_YEARS)
else:
    X = X_RENTS

df_reg = df.dropna(subset=[X]).copy()

# Fail with a clear message instead of int(NaN) in the summary line below.
if df_reg.empty:
    raise ValueError(f"No observations left after requiring non-missing {X}.")

print(
    f"N={df_reg.shape[0]} rows, "
    f"{df_reg['iso3c'].nunique()} countries, "
    f"years {int(df_reg['year'].min())}–{int(df_reg['year'].max())}"
)

# -----------------------------
# 4) Fit models (clustered SEs)
# We estimate three increasingly demanding specifications:
#
# (1) Pooled OLS:
#     - Uses all country-year observations without fixed effects.
#     - Interprets the association using both between-country and within-country variation.
#
# (2) Country Fixed Effects (FE):
#     - Adds C(iso3c) to control for all time-invariant country characteristics
#     - Identification comes from within-country changes over time.
#
# (3) Two-Way Fixed Effects (TWFE = Country + Year FE):
#     - Adds both C(iso3c) and C(year).
#     - Controls for country-invariant global shocks/common trends in each year
#     - Identification comes from within-country deviations from year-specific averages.
#
# We cluster standard errors at the country level (iso3c) to allow arbitrary
# serial correlation and heteroskedasticity within countries over time.
# This is standard for panel data where errors may be correlated within unit.
# -----------------------------
# Each formula extends the previous one with an extra set of fixed effects.
f_pooled = f"{Y} ~ {X}"
f_ctyfe  = f_pooled + " + C(iso3c)"
f_twfe   = f_ctyfe + " + C(year)"

def fit_clustered(formula, data):
    """Fit an OLS formula on ``data`` with errors clustered on ``data['iso3c']``."""
    ols_spec = smf.ols(formula, data=data)
    return ols_spec.fit(cov_type="cluster", cov_kwds={"groups": data["iso3c"]})

m1, m2, m3 = (fit_clustered(spec, df_reg) for spec in (f_pooled, f_ctyfe, f_twfe))

# -----------------------------
# 5) Output table
# -----------------------------
_extra_rows = {
    "N": lambda res: f"{int(res.nobs)}",
    "R2": lambda res: f"{res.rsquared:.3f}",
}
_table = summary_col(
    [m1, m2, m3],
    stars=True,
    model_names=["Pooled OLS", "Country FE", "Country+Year FE"],
    info_dict=_extra_rows,
)
display(_table)

# -----------------------------
# 6) coefficient readout
# -----------------------------
# Pull the point estimate and clustered SE of the rents term from each model;
# NaN is reported if the term is absent from a model's parameters.
(b1, se1), (b2, se2), (b3, se3) = (
    (res.params.get(X, np.nan), res.bse.get(X, np.nan)) for res in (m1, m2, m3)
)

print(f"\nMain coefficient on {X}:")
print(f"  Pooled OLS       : {b1:.4f} (SE {se1:.4f})")
print(f"  Country FE       : {b2:.4f} (SE {se2:.4f})")
print(f"  Country + Year FE: {b3:.4f} (SE {se3:.4f})")

N=354 rows, 59 countries, years 2016–2021

Main coefficient on total_natural_resource_rents_pct_gdp_L1:
  Pooled OLS       : -0.0293 (SE 0.0099)
  Country FE       : 0.0004 (SE 0.0012)
  Country + Year FE: 0.0000 (SE 0.0013)

# =====================================================
# ROBUSTNESS CHECK:
#   rents components (oil/gas/mineral) instead of total rents
#   wgi_mean ~ L1(oil_rents) + L1(gas_rents) + L1(mineral_rents)
#   Models: Pooled, Country FE, Country+Year FE
#   SEs   : Clustered by country (iso3c)
# =====================================================

# -----------------------------
# 0) CONFIG
# -----------------------------
Y = "wgi_mean"
X_OIL, X_GAS, X_MIN = "oil_rents_pct_gdp", "gas_rents_pct_gdp", "mineral_rents_pct_gdp"

USE_LAG, LAG_YEARS = True, 1

WINSORIZE, WINSOR_P = True, 0.01  # 1% tails

# -----------------------------
# 1) Prep + checks
# -----------------------------
needed = {"iso3c", "year", Y, X_OIL, X_GAS, X_MIN}
missing = needed.difference(merged_df.columns)
if missing:
    raise ValueError(f"merged_df missing required columns: {sorted(missing)}")

# Narrow working copy: identifiers, outcome and the three rent components.
df = merged_df[["iso3c", "year", Y, X_OIL, X_GAS, X_MIN]].copy()

df["iso3c"] = df["iso3c"].astype(str).str.upper().str.strip()

# Coerce year, outcome and regressors to numeric; unparseable values become NaN.
for c in ["year", Y, X_OIL, X_GAS, X_MIN]:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# Keep rows where identifiers and the outcome are present; rent components
# may still be missing at this point.
_has_core = df[["iso3c", "year", Y]].notna().all(axis=1)
df = df[_has_core].copy()

# -----------------------------
# 2) winsorization
# -----------------------------
def winsorize_series(s, p=0.01):
    """Clip a numeric Series to its ``[p, 1 - p]`` quantile range.

    Parameters
    ----------
    s : pandas.Series
        Values to winsorize; missing values are left missing.
    p : float, default 0.01
        Share clipped in each tail; must lie in ``[0, 0.5]``.

    Returns
    -------
    pandas.Series
        Clipped copy of ``s`` (same index and length).

    Raises
    ------
    ValueError
        If ``p`` is outside ``[0, 0.5]``, which would invert the clip bounds.
    """
    if not 0 <= p <= 0.5:
        raise ValueError(f"p must be between 0 and 0.5, got {p!r}")
    lo = s.quantile(p)
    hi = s.quantile(1 - p)
    return s.clip(lower=lo, upper=hi)

if WINSORIZE:
    for c in [X_OIL, X_GAS, X_MIN]:
        df[c] = winsorize_series(df[c], p=WINSOR_P)

# -----------------------------
# 3) lags
#    shift() moves back LAG_YEARS rows within a country, not LAG_YEARS calendar
#    years. A lag is therefore kept only where the earlier row is exactly
#    LAG_YEARS years before; across a gap it is set to missing.
# -----------------------------
if USE_LAG:
    df = df.sort_values(["iso3c", "year"])
    X1 = f"{X_OIL}_L{LAG_YEARS}"
    X2 = f"{X_GAS}_L{LAG_YEARS}"
    X3 = f"{X_MIN}_L{LAG_YEARS}"

    _year_gap = df["year"] - df.groupby("iso3c")["year"].shift(LAG_YEARS)
    _is_true_lag = _year_gap == LAG_YEARS

    for _src, _dst in ((X_OIL, X1), (X_GAS, X2), (X_MIN, X3)):
        df[_dst] = df.groupby("iso3c")[_src].shift(LAG_YEARS).where(_is_true_lag)
else:
    X1, X2, X3 = X_OIL, X_GAS, X_MIN

# The regression sample needs all three (lagged) components observed.
df_reg = df.dropna(subset=[X1, X2, X3]).copy()

# Fail with a clear message instead of int(NaN) in the summary line below.
if df_reg.empty:
    raise ValueError("No observations left after requiring all three rent components.")

print(
    f"N={df_reg.shape[0]} rows, "
    f"{df_reg['iso3c'].nunique()} countries, "
    f"years {int(df_reg['year'].min())}–{int(df_reg['year'].max())}"
)

# -----------------------------
# 4) Fit models (clustered SEs)
# -----------------------------
# Right-hand side shared by all three specifications.
rhs = " + ".join([X1, X2, X3])

f_pooled = f"{Y} ~ {rhs}"
f_ctyfe  = f_pooled + " + C(iso3c)"
f_twfe   = f_ctyfe + " + C(year)"

def fit_clustered(formula, data):
    """Fit an OLS formula on ``data`` with errors clustered on ``data['iso3c']``."""
    ols_spec = smf.ols(formula, data=data)
    return ols_spec.fit(cov_type="cluster", cov_kwds={"groups": data["iso3c"]})

m1, m2, m3 = (fit_clustered(spec, df_reg) for spec in (f_pooled, f_ctyfe, f_twfe))

# -----------------------------
# 5) Output table
# -----------------------------
_extra_rows = {
    "N": lambda res: f"{int(res.nobs)}",
    "R2": lambda res: f"{res.rsquared:.3f}",
}
_table = summary_col(
    [m1, m2, m3],
    stars=True,
    model_names=["Pooled OLS", "Country FE", "Country+Year FE"],
    info_dict=_extra_rows,
)
display(_table)

# -----------------------------
# 6) coefficient readout
# -----------------------------
def coef_line(model, name):
    """Format ``name``'s coefficient and SE from ``model`` as 'b (SE se)'; NaN if absent."""
    estimate = model.params.get(name, np.nan)
    std_err = model.bse.get(name, np.nan)
    return f"{estimate:.4f} (SE {std_err:.4f})"

print("\nKey coefficients (lagged component rents):")
for _term in (X1, X2, X3):
    print(f"  {_term}: Pooled {coef_line(m1, _term)} | FE {coef_line(m2, _term)} | TWFE {coef_line(m3, _term)}")

N=406 rows, 58 countries, years 2016–2022

Key coefficients (lagged component rents):
  oil_rents_pct_gdp_L1: Pooled -0.0277 (SE 0.0061) | FE -0.0006 (SE 0.0021) | TWFE -0.0006 (SE 0.0027)
  gas_rents_pct_gdp_L1: Pooled 0.0073 (SE 0.0052) | FE -0.0103 (SE 0.0078) | TWFE -0.0097 (SE 0.0082)
  mineral_rents_pct_gdp_L1: Pooled -0.0417 (SE 0.0229) | FE -0.0030 (SE 0.0032) | TWFE -0.0026 (SE 0.0033)

# =====================================================
# ROBUSTNESS CHECK:
#   Governance change on resource rents
#   Δwgi_mean = wgi_mean_t − wgi_mean_{t−1}
#   Δwgi_mean ~ L1(total natural resource rents % GDP)
#   Models: Pooled, Year FE, Country+Year FE
#   SEs   : Clustered by country (iso3c)
# =====================================================

# -----------------------------
# 0) CONFIG
# -----------------------------
Y = "wgi_mean"
# NOTE(review): this rebinds the module-level name `X` to a column-name string.
X = "total_natural_resource_rents_pct_gdp"

USE_LAG, LAG_YEARS = True, 1

WINSORIZE, WINSOR_P = True, 0.01  # 1% tails

# -----------------------------
# 1) Prep + checks
# -----------------------------
needed = {"iso3c", "year", Y, X}
missing = needed.difference(merged_df.columns)
if missing:
    raise ValueError(f"merged_df missing required columns: {sorted(missing)}")

# Narrow working copy: identifiers, outcome level and total rents.
df = merged_df[["iso3c", "year", Y, X]].copy()

df["iso3c"] = df["iso3c"].astype(str).str.upper().str.strip()

# Coerce year, outcome and regressor to numeric; unparseable values become NaN.
for _col in ("year", Y, X):
    df[_col] = pd.to_numeric(df[_col], errors="coerce")

# Keep rows where identifiers and the outcome are present; rents may still be
# missing at this point.
_has_core = df[["iso3c", "year", Y]].notna().all(axis=1)
df = df[_has_core].copy()

# -----------------------------
# 2) Construct ΔWGI
# -----------------------------
df = df.sort_values(["iso3c", "year"])
df["delta_wgi"] = df.groupby("iso3c")[Y].diff()
# diff() subtracts the previous *row*; keep the change only when that row is
# exactly one calendar year earlier, so delta_wgi is always wgi_t - wgi_{t-1}
# and never a multi-year change across a gap in a country's series.
df.loc[df.groupby("iso3c")["year"].diff() != 1, "delta_wgi"] = np.nan

# -----------------------------
# 3) Winsorize rents
# -----------------------------
def winsorize_series(s, p=0.01):
    """Limit a numeric Series to the band between its ``p`` and ``1 - p`` quantiles.

    Parameters
    ----------
    s : pandas.Series
        Values to winsorize; missing values are preserved.
    p : float, default 0.01
        Fraction clipped from each tail; must lie in ``[0, 0.5]``.

    Returns
    -------
    pandas.Series
        Clipped copy of ``s`` with the same index.

    Raises
    ------
    ValueError
        If ``p`` is outside ``[0, 0.5]``; such values would make the lower
        bound exceed the upper bound.
    """
    if not 0 <= p <= 0.5:
        raise ValueError(f"p must be between 0 and 0.5, got {p!r}")
    lo = s.quantile(p)
    hi = s.quantile(1 - p)
    return s.clip(lower=lo, upper=hi)

if WINSORIZE:
    df[X] = winsorize_series(df[X], p=WINSOR_P)

# -----------------------------
# 4) Lag rents
#    Relies on df already being sorted by (iso3c, year) in step 2.
#    shift() moves back LAG_YEARS rows, not calendar years, so the lag is kept
#    only where the earlier row is exactly LAG_YEARS years before; across a
#    gap in a country's series it is set to missing.
# -----------------------------
if USE_LAG:
    X_L1 = f"{X}_L{LAG_YEARS}"
    _year_gap = df["year"] - df.groupby("iso3c")["year"].shift(LAG_YEARS)
    df[X_L1] = df.groupby("iso3c")[X].shift(LAG_YEARS).where(_year_gap == LAG_YEARS)
else:
    X_L1 = X

df_reg = df.dropna(subset=["delta_wgi", X_L1]).copy()

# Fail with a clear message instead of int(NaN) in the summary line below.
if df_reg.empty:
    raise ValueError(f"No observations left after requiring delta_wgi and {X_L1}.")

print(
    f"N={df_reg.shape[0]} rows, "
    f"{df_reg['iso3c'].nunique()} countries, "
    f"years {int(df_reg['year'].min())}–{int(df_reg['year'].max())}"
)

# -----------------------------
# 5) Model formulas
# -----------------------------
# Three specifications for the year-on-year governance change.
f_pooled = f"delta_wgi ~ {X_L1}"
f_yearfe = f_pooled + " + C(year)"
f_twfe   = f_pooled + " + C(iso3c) + C(year)"

def fit_clustered(formula, data):
    """Fit an OLS formula on ``data`` with errors clustered on ``data['iso3c']``."""
    ols_spec = smf.ols(formula, data=data)
    return ols_spec.fit(cov_type="cluster", cov_kwds={"groups": data["iso3c"]})

m1, m2, m3 = (fit_clustered(spec, df_reg) for spec in (f_pooled, f_yearfe, f_twfe))

# -----------------------------
# 6) Output table
# -----------------------------
_extra_rows = {
    "N": lambda res: f"{int(res.nobs)}",
    "R2": lambda res: f"{res.rsquared:.3f}",
}
_table = summary_col(
    [m1, m2, m3],
    stars=True,
    model_names=["Pooled OLS", "Year FE", "Country+Year FE"],
    info_dict=_extra_rows,
)
display(_table)

# -----------------------------
# 7) coefficient readout
# -----------------------------
def coef_line(model, name):
    """Format ``name``'s coefficient and SE from ``model`` as 'b (SE se)'; NaN if absent."""
    estimate = model.params.get(name, np.nan)
    std_err = model.bse.get(name, np.nan)
    return f"{estimate:.4f} (SE {std_err:.4f})"

print("\nKey coefficient (lagged total resource rents):")
for _prefix, _res in (
    ("  Pooled OLS       : ", m1),
    ("  Year FE          : ", m2),
    ("  Country+Year FE  : ", m3),
):
    print(f"{_prefix}{coef_line(_res, X_L1)}")

N=413 rows, 59 countries, years 2016–2022

Key coefficient (lagged total resource rents):
  Pooled OLS       : 0.0004 (SE 0.0003)
  Year FE          : 0.0004 (SE 0.0003)
  Country+Year FE  : 0.0000 (SE 0.0013)

# Setup
# Row-wise mean of the governance indicator columns listed in WGI_INDICATORS.
merged_df["wgi_mean"] = merged_df[list(WGI_INDICATORS.values())].mean(axis=1)
# dropna() keeps sorted() from failing on a mix of strings and NaN.
all_countries = sorted(merged_df['iso3c'].dropna().unique())

# Build the ISO3 -> display-name lookup once (first label wins, matching the
# previous `.iloc[0]` behaviour) instead of scanning implementing_df per country.
_label_by_iso3 = (
    implementing_df.drop_duplicates("iso3").set_index("iso3")["label"].to_dict()
)

# Options are (display text, ISO3 value) pairs. A country without a usable
# label falls back to its ISO3 code rather than raising IndexError.
country_choices = [
    (
        f"{_label_by_iso3[c]} ({c})" if isinstance(_label_by_iso3.get(c), str) else c,
        c,
    )
    for c in all_countries
]

country_selector = SelectMultiple(
    options=country_choices,
    # Pre-select Norway only if it is actually offered; a value outside the
    # options is rejected by the widget.
    value=[c for c in ("NOR",) if c in all_countries],
    description='Countries:',
    rows=15,
    layout={'width': '400px'}
)

output = Output()

def update_plot(change):
    """Redraw the WGI trend panels and summary statistics for the current selection.

    Registered as the observer of ``country_selector``; ``change`` is accepted
    for that callback signature but not used, the selection is read from
    ``country_selector.value``. Everything is rendered inside ``output``.
    Indicator values are averaged across the selected countries per year.
    """
    with output:
        selected = list(country_selector.value)
        if not selected:
            # Clear immediately: with wait=True the old plot would stay on
            # screen because no new output ever replaces it.
            clear_output()
            return

        clear_output(wait=True)

        wgi_cols = list(WGI_INDICATORS.values())

        dfv = merged_df[merged_df['iso3c'].isin(selected)]
        trend = dfv.groupby("year")[wgi_cols].mean().reset_index()
        # Left-join onto the full 2015-2023 range so missing years appear as NaN rows.
        trend = pd.DataFrame({"year": list(range(2015, 2024))}).merge(trend, how="left")

        title = ', '.join(sorted(implementing_df[implementing_df['iso3'].isin(selected)]['label'].tolist()))

        # Plot
        fig, axes = plt.subplots(2, 3, figsize=(16, 8), sharex=True)
        axes = axes.flatten()

        # NOTE(review): assumes WGI_INDICATORS lists its columns in this same
        # order - confirm against its definition.
        titles = ["Control of Corruption", "Government Effectiveness", "Political Stability",
                  "Regulatory Quality", "Rule of Law", "Voice and Accountability"]

        for ax, col, title_text in zip(axes, wgi_cols, titles):
            ax.axvspan(2015, 2019.99, alpha=0.4, color="#D9D9D9", lw=0)
            ax.axvspan(2020, 2021.99, alpha=0.4, color="#FFE066", lw=0)
            ax.axvspan(2022, 2023.99, alpha=0.4, color="#BFD7EA", lw=0)
            ax.plot(trend["year"], trend[col], marker="o", linewidth=2, color='steelblue')
            ax.axhline(0, linestyle="--", linewidth=1, alpha=0.3)
            ax.set_title(title_text, fontweight="bold", fontsize=11)
            ax.set_xlabel("Year", fontsize=9)
            ax.set_ylabel("Score", fontsize=9)
            ax.grid(True, alpha=0.25)
            ax.set_xticks(list(range(2015, 2024)))

        fig.legend(
            handles=[Patch(facecolor=c, alpha=0.4, label=l) for c, l in
                    [("#D9D9D9", "Pre-pandemic"), ("#FFE066", "Pandemic"), ("#BFD7EA", "Post-COVID")]],
            loc="lower center", ncol=3, bbox_to_anchor=(0.5, -0.02)
        )

        fig.suptitle(f"WGI Trends: {title} (2015–2023)", y=1.01, fontsize=15, fontweight="bold")
        plt.tight_layout(rect=[0, 0.06, 1, 1])
        plt.show()
        # Release the figure so repeated selections do not accumulate open figures.
        plt.close(fig)

        # Statistics (single print)
        pre_avg = trend[trend['year'].between(2015, 2019)][wgi_cols].mean()
        post_avg = trend[trend['year'].between(2022, 2023)][wgi_cols].mean()
        data_2023 = trend[trend['year'] == 2023].iloc[0]

        print(f"\n{'='*80}\nSummary Statistics: {title}\n{'='*80}")
        print("\nWGI Scores (2023):\n" + "-"*80)

        scores_2023 = [data_2023[k] for k in wgi_cols if pd.notna(data_2023[k])]
        for dim_key, dim_name in zip(wgi_cols, titles):
            if pd.notna(data_2023[dim_key]):
                print(f"  {dim_name:30s}: {data_2023[dim_key]:+.3f}")

        if scores_2023:
            print(f"\n  WGI MEAN (2023): {np.mean(scores_2023):+.3f}")

        print("\nTOTAL CHANGE:\n" + "-"*80)
        for k, n in zip(wgi_cols, titles):
            change_val = post_avg[k] - pre_avg[k]
            if pd.isna(change_val):
                # Either period has no data: do not print a misleading arrow.
                print(f"  {n:30s}: n/a")
                continue
            arrow = '↑' if change_val > 0 else ('↓' if change_val < 0 else '→')
            print(f"  {n:30s}: {change_val:+.3f} {arrow}")

        print("\nRESOURCE RENTS (2015-2021):\n" + "-"*80)
        rent_data = dfv[dfv['year'].between(2015, 2021)]
        for rc, rn in zip(['mineral_rents_pct_gdp', 'oil_rents_pct_gdp', 'gas_rents_pct_gdp', 'total_natural_resource_rents_pct_gdp'],
                          ['Mineral', 'Oil', 'Gas', 'Total']):
            rent_mean = rent_data[rc].mean()
            if pd.notna(rent_mean):
                print(f"  {rn:30s}: {rent_mean:6.2f}%")

# Remove any previously registered observers before registering update_plot,
# so that each change of the selection triggers a single call.
country_selector.unobserve_all() 
country_selector.observe(update_plot, names='value')

# Lay out the hint text, the selector and the output area in one column.
display(VBox([HTML("<b>Filter by country:</b> Select countries to compare (updates automatically)"), country_selector, output]))

# Render once for the initial selection; update_plot ignores its argument.
update_plot(None)

VBox(children=(HTML(value='<b>Filter by country:</b> Select countries to compare (updates automatically)'), Se…

	control_of_corruption_baseline_2019_2021	government_effectiveness_baseline_2019_2021	political_stability_baseline_2019_2021	regulatory_quality_baseline_2019_2021	rule_of_law_baseline_2019_2021	voice_and_accountability_baseline_2019_2021	delta_control_of_corruption	delta_government_effectiveness	delta_political_stability	delta_regulatory_quality	delta_rule_of_law	delta_voice_and_accountability
cluster
0	-0.65	-0.65	-0.62	-0.57	-0.71	-0.36	0.05	0.02	-0.05	-0.02	-0.00	-0.05
1	1.72	1.53	0.63	1.60	1.61	1.34	-0.10	-0.22	-0.13	-0.17	-0.16	-0.01

	control_of_corruption_baseline_2019_2021	government_effectiveness_baseline_2019_2021	political_stability_baseline_2019_2021	regulatory_quality_baseline_2019_2021	rule_of_law_baseline_2019_2021	voice_and_accountability_baseline_2019_2021	delta_control_of_corruption	delta_government_effectiveness	delta_political_stability	delta_regulatory_quality	delta_rule_of_law	delta_voice_and_accountability
cluster
0	1.72	1.53	0.63	1.60	1.61	1.34	-0.10	-0.22	-0.13	-0.17	-0.16	-0.01
1	-0.99	-0.92	-1.09	-0.77	-0.99	-0.78	-0.07	-0.04	-0.13	-0.08	-0.06	-0.15
2	-0.35	-0.40	-0.19	-0.38	-0.45	0.04	0.16	0.08	0.03	0.04	0.05	0.03

	PC1_loading
rule_of_law_baseline_2019_2021	0.430
government_effectiveness_baseline_2019_2021	0.422
control_of_corruption_baseline_2019_2021	0.420
regulatory_quality_baseline_2019_2021	0.409
voice_and_accountability_baseline_2019_2021	0.394
political_stability_baseline_2019_2021	0.336
delta_voice_and_accountability	0.120
delta_regulatory_quality	-0.064
delta_government_effectiveness	-0.062
delta_rule_of_law	-0.056
delta_political_stability	0.004
delta_control_of_corruption	-0.003

	PC1_loading
rule_of_law_baseline_2019_2021	0.430
government_effectiveness_baseline_2019_2021	0.422
control_of_corruption_baseline_2019_2021	0.420
regulatory_quality_baseline_2019_2021	0.409
voice_and_accountability_baseline_2019_2021	0.394
political_stability_baseline_2019_2021	0.336

	PC1_loading
delta_voice_and_accountability	0.120
delta_regulatory_quality	-0.064
delta_government_effectiveness	-0.062
delta_rule_of_law	-0.056
delta_political_stability	0.004
delta_control_of_corruption	-0.003

Natural Resource Revenues and Governance Outcomes

An Empirical Analysis Using EITI and World Bank Data (2015–2023)

ST445 - Managing and Visualizing Data

Candidate IDs: 59423, 60507, 63692

Table of Contents¶

I. Introduction¶

I.1 Background¶

I.2 Research Question and Methodology¶

II. Data Sources and Construction ¶

II.0 Configuration and Global Setup ¶

II.1 World Bank Data (WDI and WGI)¶

II.2 EITI Data ¶

II.3 Data Merging ¶

III. Exploratory Data Analysis ¶

III.0 EDA Set-up ¶

III.1 Extractive Revenues and Governance Trends Over Time ¶

III.2 Governance Trends Across EITI-Implementing Countries¶

III.3 Heterogeneity in Governance Changes Across Countries ¶

III.4 Average Governance Levels Across EITI-Implementing Countries ¶

III.5 Correlation Structure of Governance Dimensions ¶

III.6 Governance Regimes: Clustering Countries by Institutional Quality ¶

III.7 Robustness of Governance Regimes: Hierarchical Clustering ¶

III.8 Resource Rents and Governance: Cross-Section vs Within-Country Effects ¶

III.9 Robustness: Oil, Gas, and Mineral Rents ¶

III.10 Robustness: Governance Changes Instead of Levels ¶

III.11 Country Case Analysis by Governance Regime ¶

Case 1: Norway (Strong Governance, Europe)¶

Case 2: Indonesia (Moderate Governance, Asia)¶

Case 3: Burkina Faso (Weak Governance, Africa)¶

IV. Conclusion ¶

V. References ¶

	cluster_label	n	countries
0	Non-Strong Governance	54	AFG, AGO, ALB, ARG, ARM, AZE, BFA, CIV, CMR, COD, COG, COL, DOM, ECU, ETH, GAB, GHA, GIN, GTM, GUY, HND, IDN, IRQ, KAZ, KGZ, LBR, MDG, MEX, MLI, MMR, MNG, MOZ, MRT, MWI, NER, NGA, PER, PHL, PNG, SEN, SLB, SLE, STP, SUR, SYC, TCD, TGO, TJK, TLS, TTO, TZA, UGA, UKR, ZMB
1	Strong Governance	5	DEU, GBR, NLD, NOR, USA

	cluster_label	n	countries
0	Moderate Governance	28	AGO, ALB, ARG, ARM, CIV, COL, DOM, ECU, GHA, GUY, IDN, KAZ, MNG, MWI, PHL, PNG, SEN, SLB, SLE, STP, SUR, SYC, TGO, TLS, TTO, TZA, UKR, ZMB
1	Weak Governance	26	AFG, AZE, BFA, CMR, COD, COG, ETH, GAB, GIN, GTM, HND, IRQ, KGZ, LBR, MDG, MEX, MLI, MMR, MOZ, MRT, NER, NGA, PER, TCD, TJK, UGA
2	Strong Governance	5	DEU, GBR, NLD, NOR, USA

	cluster_label	n	countries
0	Moderate Governance	51	AFG, AGO, ALB, ARG, ARM, AZE, CIV, CMR, COD, COG, COL, DOM, ECU, ETH, GAB, GHA, GIN, GTM, GUY, HND, IDN, IRQ, KAZ, KGZ, LBR, MDG, MEX, MNG, MOZ, MRT, MWI, NER, NGA, PER, PHL, PNG, SEN, SLB, SLE, STP, SUR, SYC, TCD, TGO, TJK, TLS, TTO, TZA, UGA, UKR, ZMB
1	Strong Governance	5	DEU, GBR, NLD, NOR, USA
2	Weak Governance	3	BFA, MLI, MMR

hierarchical	Moderate Governance	Strong Governance	Weak Governance
kmeans
Moderate Governance	28	0	0
Strong Governance	0	5	0
Weak Governance	23	0	3

	Pooled OLS	Country FE	Country+Year FE
Intercept	-0.1600	-1.6195***	-1.6072***
	(0.1413)	(0.0007)	(0.0096)
total_natural_resource_rents_pct_gdp_L1	-0.0293***	0.0004	0.0000
	(0.0099)	(0.0012)	(0.0013)
C(iso3c)[T.AGO]		0.7004***	0.7094***
		(0.0273)	(0.0303)
C(iso3c)[T.ALB]		1.5686***	1.5689***
		(0.0009)	(0.0010)
C(iso3c)[T.ARG]		1.5425***	1.5428***
		(0.0010)	(0.0011)
C(iso3c)[T.ARM]		1.4000***	1.4002***
		(0.0007)	(0.0007)
C(iso3c)[T.AZE]		0.9111***	0.9183***
		(0.0217)	(0.0241)
C(iso3c)[T.BFA]		1.1062***	1.1101***
		(0.0118)	(0.0131)
C(iso3c)[T.CIV]		1.0454***	1.0464***
		(0.0029)	(0.0032)
C(iso3c)[T.CMR]		0.5390***	0.5409***
		(0.0058)	(0.0065)
C(iso3c)[T.COD]		-0.0036	0.0035
		(0.0216)	(0.0240)
C(iso3c)[T.COG]		0.5036***	0.5146***
		(0.0331)	(0.0368)
C(iso3c)[T.COL]		1.4112***	1.4124***
		(0.0037)	(0.0041)
C(iso3c)[T.DEU]		3.0542***	3.0540***
		(0.0006)	(0.0007)
C(iso3c)[T.DOM]		1.4506***	1.4508***
		(0.0005)	(0.0005)
C(iso3c)[T.ECU]		1.1803***	1.1820***
		(0.0051)	(0.0056)
C(iso3c)[T.ETH]		0.7082***	0.7113***
		(0.0093)	(0.0104)
C(iso3c)[T.GAB]		0.8822***	0.8882***
		(0.0183)	(0.0203)
C(iso3c)[T.GBR]		2.9554***	2.9553***
		(0.0002)	(0.0003)
C(iso3c)[T.GHA]		1.6173***	1.6208***
		(0.0107)	(0.0119)
C(iso3c)[T.GIN]		0.6714***	0.6761***
		(0.0140)	(0.0156)
C(iso3c)[T.GTM]		0.9648***	0.9652***
		(0.0011)	(0.0012)
C(iso3c)[T.GUY]		1.3862***	1.3902***
		(0.0120)	(0.0134)
C(iso3c)[T.HND]		0.9317***	0.9320***
		(0.0010)	(0.0011)
C(iso3c)[T.IDN]		1.4757***	1.4767***
		(0.0030)	(0.0033)
C(iso3c)[T.IRQ]		0.1050**	0.1193**
		(0.0434)	(0.0483)
C(iso3c)[T.KAZ]		1.2351***	1.2405***
		(0.0165)	(0.0183)
C(iso3c)[T.KGZ]		0.9250***	0.9272***
		(0.0066)	(0.0074)
C(iso3c)[T.LBR]		0.8516***	0.8583***
		(0.0204)	(0.0227)
C(iso3c)[T.MDG]		0.8590***	0.8614***
		(0.0073)	(0.0081)
C(iso3c)[T.MEX]		1.1947***	1.1953***
		(0.0019)	(0.0021)
C(iso3c)[T.MLI]		0.6451***	0.6481***
		(0.0091)	(0.0101)
C(iso3c)[T.MMR]		0.5965***	0.5987***
		(0.0069)	(0.0077)
C(iso3c)[T.MNG]		1.5976***	1.6029***
		(0.0162)	(0.0180)
C(iso3c)[T.MOZ]		0.7608***	0.7663***
		(0.0166)	(0.0185)
C(iso3c)[T.MRT]		0.8535***	0.8553***
		(0.0055)	(0.0061)
C(iso3c)[T.MWI]		1.1514***	1.1538***
		(0.0073)	(0.0081)
C(iso3c)[T.NER]		0.8554***	0.8582***
		(0.0086)	(0.0096)
C(iso3c)[T.NGA]		0.5421***	0.5445***
		(0.0073)	(0.0081)
C(iso3c)[T.NLD]		3.2352***	3.2350***
		(0.0004)	(0.0004)
C(iso3c)[T.NOR]		3.3642***	3.3661***
		(0.0057)	(0.0063)
C(iso3c)[T.PER]		1.4603***	1.4618***
		(0.0046)	(0.0051)
C(iso3c)[T.PHL]		1.2794***	1.2795***
		(0.0002)	(0.0003)
C(iso3c)[T.PNG]		1.0196***	1.0250***
		(0.0163)	(0.0181)
C(iso3c)[T.SEN]		1.5231***	1.5242***
		(0.0031)	(0.0034)
C(iso3c)[T.SLB]		1.4267***	1.4340***
		(0.0220)	(0.0244)
C(iso3c)[T.SLE]		0.9934***	0.9974***
		(0.0121)	(0.0134)
C(iso3c)[T.STP]		1.3577***	1.3585***
		(0.0025)	(0.0028)
C(iso3c)[T.SUR]		1.4191***	1.4228***
		(0.0111)	(0.0124)
C(iso3c)[T.SYC]		2.1357***	2.1355***
		(0.0006)	(0.0007)
C(iso3c)[T.TCD]		0.2397***	0.2458***
		(0.0184)	(0.0205)
C(iso3c)[T.TGO]		0.8652***	0.8678***
		(0.0079)	(0.0088)
C(iso3c)[T.TJK]		0.4404***	0.4419***
		(0.0045)	(0.0050)
C(iso3c)[T.TLS]		1.1436***	1.1603***
		(0.0505)	(0.0561)
C(iso3c)[T.TTO]		1.6844***	1.6871***
		(0.0083)	(0.0092)
C(iso3c)[T.TZA]		1.0735***	1.0750***
		(0.0046)	(0.0052)
C(iso3c)[T.UGA]		0.9762***	0.9798***
		(0.0110)	(0.0122)
C(iso3c)[T.UKR]		0.9755***	0.9762***
		(0.0020)	(0.0022)
C(iso3c)[T.USA]		2.7341***	2.7340***
		(0.0002)	(0.0003)
C(iso3c)[T.ZMB]		1.1575***	1.1620***
		(0.0135)	(0.0151)
C(year)[T.2017]			-0.0087
			(0.0071)
C(year)[T.2018]			-0.0092
			(0.0104)
C(year)[T.2019]			-0.0089
			(0.0137)
C(year)[T.2020]			-0.0211
			(0.0155)
C(year)[T.2021]			-0.0245
			(0.0209)
R-squared	0.1416	0.9928	0.9930
R-squared Adj.	0.1391	0.9914	0.9914
N	354	354	354
R2	0.142	0.993	0.993

	Pooled OLS	Year FE	Country+Year FE
Intercept	-0.0055	0.0123	-0.0140
	(0.0046)	(0.0118)	(0.0121)
total_natural_resource_rents_pct_gdp_L1	0.0004	0.0004	0.0000
	(0.0003)	(0.0003)	(0.0013)
C(year)[T.2017]		-0.0249*	-0.0246*
		(0.0133)	(0.0143)
C(year)[T.2018]		-0.0172	-0.0164
		(0.0131)	(0.0142)
C(year)[T.2019]		-0.0164	-0.0158
		(0.0139)	(0.0152)
C(year)[T.2020]		-0.0283**	-0.0281**
		(0.0131)	(0.0141)
C(year)[T.2021]		-0.0191	-0.0194
		(0.0191)	(0.0208)
C(year)[T.2022]		-0.0207	-0.0191
		(0.0158)	(0.0165)
C(iso3c)[T.AGO]			0.0597*
			(0.0311)
C(iso3c)[T.ALB]			0.0330***
			(0.0010)
C(iso3c)[T.ARG]			0.0409***
			(0.0014)
C(iso3c)[T.ARM]			0.0525***
			(0.0018)
C(iso3c)[T.AZE]			0.0404
			(0.0258)
C(iso3c)[T.BFA]			-0.0134
			(0.0147)
C(iso3c)[T.CIV]			0.0643***
			(0.0035)
C(iso3c)[T.CMR]			0.0155**
			(0.0064)
C(iso3c)[T.COD]			0.0244
			(0.0274)
C(iso3c)[T.COG]			0.0316
			(0.0380)
C(iso3c)[T.COL]			0.0376***
			(0.0044)
C(iso3c)[T.DEU]			0.0124***
			(0.0006)
C(iso3c)[T.DOM]			0.0657***
			(0.0007)
C(iso3c)[T.ECU]			0.0590***
			(0.0059)
C(iso3c)[T.ETH]			0.0227**
			(0.0098)
C(iso3c)[T.GAB]			0.0210
			(0.0205)
C(iso3c)[T.GBR]			0.0003*
			(0.0002)
C(iso3c)[T.GHA]			0.0273**
			(0.0124)
C(iso3c)[T.GIN]			0.0210
			(0.0139)
C(iso3c)[T.GTM]			0.0199***
			(0.0013)
C(iso3c)[T.GUY]			0.0506***
			(0.0175)
C(iso3c)[T.HND]			0.0142***
			(0.0011)
C(iso3c)[T.IDN]			0.0665***
			(0.0037)
C(iso3c)[T.IRQ]			0.0280
			(0.0487)
C(iso3c)[T.KAZ]			0.0490**
			(0.0204)
C(iso3c)[T.KGZ]			0.0287***
			(0.0083)
C(iso3c)[T.LBR]			0.0390*
			(0.0231)
C(iso3c)[T.MDG]			0.0326***
			(0.0078)
C(iso3c)[T.MEX]			0.0007
			(0.0023)
C(iso3c)[T.MLI]			-0.0203*
			(0.0118)
C(iso3c)[T.MMR]			-0.0299***
			(0.0080)
C(iso3c)[T.MNG]			0.0330
			(0.0212)
C(iso3c)[T.MOZ]			-0.0012
			(0.0183)
C(iso3c)[T.MRT]			0.0519***
			(0.0072)
C(iso3c)[T.MWI]			0.0355***
			(0.0075)
C(iso3c)[T.NER]			0.0256***
			(0.0092)
C(iso3c)[T.NGA]			0.0227***
			(0.0083)
C(iso3c)[T.NLD]			0.0151***
			(0.0004)
C(iso3c)[T.NOR]			0.0192***
			(0.0071)
C(iso3c)[T.PER]			0.0127*
			(0.0066)
C(iso3c)[T.PHL]			0.0256***
			(0.0005)
C(iso3c)[T.PNG]			0.0279
			(0.0203)
C(iso3c)[T.SEN]			0.0351***
			(0.0036)
C(iso3c)[T.SLB]			0.0487**
			(0.0240)
C(iso3c)[T.SLE]			0.0414***
			(0.0129)
C(iso3c)[T.STP]			0.0505***
			(0.0026)
C(iso3c)[T.SUR]			0.0154
			(0.0121)
C(iso3c)[T.SYC]			0.0694***
			(0.0006)
C(iso3c)[T.TCD]			0.0127
			(0.0211)
C(iso3c)[T.TGO]			0.0469***
			(0.0088)
C(iso3c)[T.TJK]			0.0215***
			(0.0058)
C(iso3c)[T.TLS]			0.0819
			(0.0537)
C(iso3c)[T.TTO]			0.0265***
			(0.0092)
C(iso3c)[T.TZA]			0.0332***
			(0.0055)
C(iso3c)[T.UGA]			0.0245**
			(0.0116)
C(iso3c)[T.UKR]			0.0500***
			(0.0032)
C(iso3c)[T.USA]			-0.0005***
			(0.0001)
C(iso3c)[T.ZMB]			0.0205
			(0.0192)
R-squared	0.0038	0.0203	0.1326
R-squared Adj.	0.0014	0.0034	-0.0299
N	413	413	413
R2	0.004	0.020	0.133

	Pooled OLS	Country FE	Country+Year FE
Intercept	-0.2645**	-1.6330***	-1.6196***
	(0.1265)	(0.0000)	(0.0116)
oil_rents_pct_gdp_L1	-0.0277***	-0.0006	-0.0006
	(0.0061)	(0.0021)	(0.0027)
gas_rents_pct_gdp_L1	0.0073	-0.0103	-0.0097
	(0.0052)	(0.0078)	(0.0082)
mineral_rents_pct_gdp_L1	-0.0417*	-0.0030	-0.0026
	(0.0229)	(0.0032)	(0.0033)
C(iso3c)[T.AGO]		0.7624***	0.7609***
		(0.0499)	(0.0637)
C(iso3c)[T.ALB]		1.5885***	1.5884***
		(0.0024)	(0.0031)
C(iso3c)[T.ARG]		1.5370***	1.5367***
		(0.0032)	(0.0040)
C(iso3c)[T.ARM]		1.4237***	1.4231***
		(0.0056)	(0.0058)
C(iso3c)[T.AZE]		0.9777***	0.9748***
		(0.0458)	(0.0583)
C(iso3c)[T.BFA]		1.1044***	1.1025***
		(0.0169)	(0.0175)
C(iso3c)[T.CIV]		1.0929***	1.0924***
		(0.0033)	(0.0037)
C(iso3c)[T.CMR]		0.5590***	0.5588***
		(0.0051)	(0.0066)
C(iso3c)[T.COD]		0.0394*	0.0370*
		(0.0207)	(0.0217)
C(iso3c)[T.COG]		0.5587***	0.5572***
		(0.0553)	(0.0704)
C(iso3c)[T.COL]		1.4337***	1.4334***
		(0.0059)	(0.0076)
C(iso3c)[T.DEU]		3.0583***	3.0582***
		(0.0001)	(0.0001)
C(iso3c)[T.DOM]		1.4926***	1.4922***
		(0.0036)	(0.0037)
C(iso3c)[T.ECU]		1.2053***	1.2050***
		(0.0101)	(0.0128)
C(iso3c)[T.ETH]		0.7141***	0.7141***
		(0.0003)	(0.0004)
C(iso3c)[T.GAB]		0.9206***	0.9198***

Natural Resource Revenues and Governance Outcomes

An Empirical Analysis Using EITI and World Bank Data (2015–2023)

ST445 - Managing and Visualizing Data

Candidate IDs: 59423, 60507, 63692

Table of Contents¶

I. Introduction¶

I.1 Background¶

I.2 Research Question and Methodology¶

II. Data Sources and Construction¶

II.0 Configuration and Global Setup¶

II.1 World Bank Data (WDI and WGI)¶

II.2 EITI Data¶

II.3 Data Merging¶

III. Exploratory Data Analysis¶

III.0 EDA Set-up¶

III.1 Extractive Revenues and Governance Trends Over Time ¶

III.2 Governance Trends Across EITI-Implementing Countries¶

III.3 Heterogeneity in Governance Changes Across Countries¶

III.4 Average Governance Levels Across EITI-Implementing Countries¶

III.5 Correlation Structure of Governance Dimensions¶

III.6 Governance Regimes: Clustering Countries by Institutional Quality¶

III.7 Robustness of Governance Regimes: Hierarchical Clustering¶

III.8 Resource Rents and Governance: Cross-Section vs Within-Country Effects¶

III.9 Robustness: Oil, Gas, and Mineral Rents¶

III.10 Robustness: Governance Changes Instead of Levels¶

III.11 Country Case Analysis by Governance Regime¶

Case 1: Norway (Strong Governance, Europe)¶

Case 2: Indonesia (Moderate Governance, Asia)¶

Case 3: Burkina Faso (Weak Governance, Africa)¶

IV. Conclusion¶

V. References¶

II. Data Sources and Construction ¶

II.0 Configuration and Global Setup ¶

II.2 EITI Data ¶

II.3 Data Merging ¶

III. Exploratory Data Analysis ¶

III.0 EDA Set-up ¶

III.3 Heterogeneity in Governance Changes Across Countries ¶

III.4 Average Governance Levels Across EITI-Implementing Countries ¶

III.5 Correlation Structure of Governance Dimensions ¶

III.6 Governance Regimes: Clustering Countries by Institutional Quality ¶

III.7 Robustness of Governance Regimes: Hierarchical Clustering ¶

III.8 Resource Rents and Governance: Cross-Section vs Within-Country Effects ¶

III.9 Robustness: Oil, Gas, and Mineral Rents ¶

III.10 Robustness: Governance Changes Instead of Levels ¶

III.11 Country Case Analysis by Governance Regime ¶

IV. Conclusion ¶

V. References ¶