##################################################################################################
### Program: SimTrade_S&P500_returns_and_volatility_2025_12_23_SB                              ###
### Author: Saral Bindal                                                                       ###
### Contact: saralbindal.24@kgpian.iitkgp.ac.in                                                ###
### Article on the SimTrade blog: https://www.simtrade.fr/blog_simtrade/historical-volatility/ ###
##################################################################################################


# Python program used to calculate the historical volatility of the S&P 500 index using daily data over the period 2020–2025.

# Objectives of the program:
#   1) Compute daily arithmetic and logarithmic returns S&P 500 index (2020-2025) data and plot their empirical distribution.
#   2) Compute the first four statistical moments (mean, variance, skewness, and kurtosis) for both return measures.
#   3) Compute the annualized and rolling historical volatility and plot a chart for the same.


####################################################################################################
### Organization of the program:                                                                 ###
###   STEP 1: Environment setup and package loading                                              ###
###   STEP 2: Downloading market data                                                            ###
###   STEP 3: Computation and visualization of returns                                           ###
###   STEP 4: Computation and visualization of Historical volatility                             ###
####################################################################################################


####################################################################################################
### Parameters to change:                                                                        ###
###   Rolling window length for volatility estimation                                            ###
####################################################################################################


#################
# Documentation #
#################

# Python packages 
# https://pandas.pydata.org/docs/
# https://numpy.org/doc/stable/
# https://matplotlib.org/stable/
# https://docs.scipy.org/doc/scipy/
# https://ranaroussi.github.io/yfinance/

# Academic articles
# Bollerslev, T. (1986). Generalized Autoregressive Conditional Heteroskedasticity, Journal of Econometrics, 31(3), 307–327.
# Engle, R. F. (1982). Autoregressive Conditional Heteroscedasticity with Estimates of the Variance of United Kingdom Inflation, Econometrica, 50(4), 987–1007.
# Fama, E. F., & French, K. R. (2004). The Capital Asset Pricing Model: Theory and Evidence, Journal of Economic Perspectives, 18(3), 25–46.
# Heston, S. L. (1993). A Closed-Form Solution for Options with Stochastic Volatility with Applications to Bond and Currency Options, The Journal of Finance, 48(3), 1–24.
# Markowitz, H. M. (1952). Portfolio Selection, The Journal of Finance, 7(1), 77–91.
# Parkinson, M. (1980). The extreme value method for estimating the variance of the rate of return. Journal of Business, 53(1), 61–65.
# Sharpe, W. F. (1964). Capital Asset Prices: A Theory of Market Equilibrium under Conditions of Risk, The Journal of Finance, 19(3), 425–442.
# Tsay, R. S. (2010). Analysis of financial time series, John Wiley & Sons.


#################################################
# STEP 1: Environment setup and package loading #       
#################################################

import sys
import subprocess

# Function to install a package if it is not already installed
def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Import pandas (data manipulation)
try:
    import pandas as pd
except ImportError:
    install("pandas")
    import pandas as pd

# Import numpy (numerical computations)
try:
    import numpy as np
except ImportError:
    install("numpy")
    import numpy as np

# Import matplotlib (data visualization)
try:
    import matplotlib.pyplot as plt
except ImportError:
    install("matplotlib")
    import matplotlib.pyplot as plt

# Import scipy.stats (statistical moments)
try:
    import scipy.stats as stats
except ImportError:
    install("scipy")
    import scipy.stats as stats

# Import yfinance (market data download)
try:
    import yfinance as yf
except ImportError:
    install("yfinance")
    import yfinance as yf


######################################
# STEP 2: Downloading of market data #       
######################################

# Download daily S&P 500 index data for the last 5 years
data = yf.download("^GSPC", period="5y", interval="1d")
data.head()

# Remove unnecessary OHLC and volume columns
data = data.drop([('High', '^GSPC'), ('Low', '^GSPC'),
                  ('Open', '^GSPC'), ('Volume', '^GSPC')], axis=1)

# Rename column and reset index
data.columns = ['Close']
data = data.reset_index()
print(data)


####################################################
# STEP 3: Computation and visualization of returns #
####################################################

# Compute daily arithmetic returns
data["Arithmetic_Return"] = data["Close"].pct_change()

# Compute daily logarithmic returns
data["Log_Return"] = np.log(data["Close"] / data["Close"].shift(1))

# Remove missing values created by lagged computations
data = data.dropna()
print(data)


# Extract logarithmic returns
log_ret = data["Log_Return"].dropna()

# Compute descriptive statistics for log returns
mean_log = log_ret.mean()
variance_log = log_ret.var()
skewness_log = log_ret.skew()
kurtosis_log = log_ret.kurt()
std_log = log_ret.std()
annualized_vol = std_log * np.sqrt(252)

# Display statistics for logarithmic returns
print("For Logarithmic return:")
print("Mean:", mean_log)
print("Variance:", variance_log)
print("Skewness:", skewness_log)
print("Kurtosis:", kurtosis_log)
print(f"Standard Deviation / Daily Volatility: {std_log*100}%")
print(f"Annualized Volatility: {annualized_vol*100}%")
print("\n")

# Plot histogram of logarithmic returns
plt.figure(figsize=(10, 6))
plt.hist(data["Log_Return"].dropna() * 100, bins=70, edgecolor='black')
plt.title("S&P 500 Daily Returns (Logarithmic)")
plt.xlabel("% Return")
plt.ylabel("Frequency")
plt.grid(True, alpha=0.3)
plt.show()



# Extract arithmetic returns
arith_ret = data["Arithmetic_Return"].dropna()

# Compute descriptive statistics for arithmetic returns
mean_arith = arith_ret.mean()
variance_arith = arith_ret.var()
skewness_arith = arith_ret.skew()
kurtosis_arith = arith_ret.kurt()
std_arith = arith_ret.std()
annualized_vol_arith = std_arith * np.sqrt(252)

# Display statistics for arithmetic returns
print("For Arithmatic return:")
print("Mean:", mean_arith)
print("Variance:", variance_arith)
print("Skewness:", skewness_arith)
print("Kurtosis:", kurtosis_arith)
print(f"Standard Deviation / Daily Volatility: {std_arith * 100}%")
print(f"Annualized Volatility: {annualized_vol_arith * 100}%")
print("\n")

# Plot histogram of arithmetic returns
plt.figure(figsize=(10, 6))
plt.hist(data["Arithmetic_Return"].dropna() * 100, bins=70, edgecolor='black')
plt.title("S&P 500 Daily Returns (Arithmetic)")
plt.xlabel("% Return")
plt.ylabel("Frequency")
plt.grid(True, alpha=0.3)
plt.show()


##################################################################
# STEP 4: Computation and visualization of Historical volatility #
##################################################################

# Function to compute x-month annualized historical volatility
def x_month_hv(x, data):
    days = int(x * 21)         # Approximate trading days per month
    log_ret = data.tail(days)  # Extract return window
    daily_vol = log_ret.std()  # Daily volatility
    hv = daily_vol * np.sqrt(252)
    return hv * 100

# Historical volatility using logarithmic returns
print("For Logarithmic return:")
print(f"1-Month HV: {x_month_hv(1, data['Log_Return']):.2f}%")
print(f"3-Month HV: {x_month_hv(3, data['Log_Return']):.2f}%")
print(f"6-Month HV: {x_month_hv(6, data['Log_Return']):.2f}%")
print("\n")

# Historical volatility using arithmetic returns
print("For Arithmetic return:")
print(f"1-Month HV: {x_month_hv(1, data['Arithmetic_Return']):.2f}%")
print(f"3-Month HV: {x_month_hv(3, data['Arithmetic_Return']):.2f}%")
print(f"6-Month HV: {x_month_hv(6, data['Arithmetic_Return']):.2f}%")
print("\n")


# Function to compute and plot rolling historical volatility
def historical_volatility(log_returns, months=1, trading_days=21, plot=True):
    df = pd.DataFrame({"Log_Return": log_returns})
    window = months * trading_days
    df["Rolling_HV"] = df["Log_Return"].rolling(window).std() * np.sqrt(252) * 100

    if plot:
        plt.figure(figsize=(12, 6))
        plt.plot(df["Rolling_HV"])
        plt.title(f"{months}-Month Historical Volatility (Annualized)")
        plt.xlabel("Date")
        plt.ylabel("Volatility (%)")
        plt.grid(True)
        plt.show()

    return df

# Compute and plot 3-month rolling historical volatility
historical_volatility(data["Log_Return"], months=3)


#End of the code