##########################################################
### Program: Risk_measures_analysis_2023_01_08_FL_V1 #####
##########################################################

# Authors: Francois LONGIN and Shengyu ZHENG (ESSEC Business School)

# Objective of this file: estimate different risk measures for financial assets

# Structure of this file: 
#   Step 1: definition of the input and output files 
#   Step 2: reading of input files (data)
#   Step 3: computation of basic statistics
#   Step 4: definition of parameters for risk measures
#   Step 5: computation of risk measures
#   Step 6: production of output files


 # Language used by R to display the error messages 
 Sys.setenv(LANG = "en")

 # Suppression of all data in the R memory
 rm(list=ls())

 # Clear the console in RStudio
 cat("\014") 

 # Install required R packages if need

 if (!require(rstudioapi)) install.packages('rstudioapi')
 if (!require(lubridate)) install.packages('lubridate')
 if (!require(dplyr)) install.packages('dplyr')
 if (!require(quantmod)) install.packages('quantmod')
 if (!require(moments)) install.packages('moments')
 if (!require(MASS)) install.packages('MASS')
 if (!require(POT)) install.packages('POT')
 if (!require(cvar)) install.packages('cvar')
 
 # R packages used in the R code
 library(rstudioapi) # R function from the rstudioapi to get the name of this R program 
 library(lubridate) #formatting dates
 library(dplyr) #data manipulation 
 library(quantmod) #retrieving financial data from Yahoo Finance
 library(moments) # Descriptive Statistics
 library(MASS) #fit normal distribution
 library(POT) # Peak over threshold (POT) approach: estimation of the Generalized Pareto distribution (GDP)
 library(cvar) # Calculation of expected shortfall or stress value based on a theoretical distribution


 # Definition of the working directory
 # Adapt the path to the working directory according to the location of the files on your computer and the type of computer (PC/Mac)
 wd = "G:/1. SimTrade/0.  Blog SimTrade/1. Billets en cours de redaction/2022-05 Shengyu ZHENG/Z Risk estimation with R/1. Program/Latest version" # For Francois Longin (PC)

 # Setting the working directory for the R code
 setwd(wd)

 
# ************************************************************
# ***** Step 1: definition of the input and output files *****
# ************************************************************

 # *** Definition of the names of the output files ***

 # Output file for the execution of the program
 date_execution = format(Sys.time(), "%Y_%m_%d_%H_%M") # Date (and time) of execution of the program (format YYYY_MM_DD_HH_MM)
 file_output1 = paste("3_Results_Risk_measures_analysis_Execution_",date_execution,".csv", sep = "")
 #file_output1 = paste("3_Results_Risk_measures_analysis_Execution_",date_execution,".csv", sep = "") # Use this if codes are run on R Studio Cloud
 
 # Output file for the estimation results of risk measures
 date_estimation <- as.POSIXct(today(), format = "%d/%m/%Y") 
 file_output2 = paste("Output_risk_measures_",date_execution,".csv", sep = "")
 
 # Output file for the data downloaded (from Yahoo Finance for example)
 file_output3 = paste("Data_",date_execution,".csv", sep = "")
 
 # *** Information for the execution file ***

 # R function from the rstudioapi to get the name of this R program 
 RFileName <- rstudioapi::getActiveDocumentContext()$path

 a <- paste("Execution of the R program: ", RFileName)
 write(a, file=file_output1)
 write("", file=file_output1, append = TRUE)

 # Starting time of the R program
 starting_time = Sys.time()
 a <- paste("Starting time of the R program:", starting_time)
 write(a, file=file_output1, append = TRUE) # Writing in the same file (use of "Append = TRUE")
 write("", file=file_output1, append = TRUE) 

 # Version of R used in this program use the 'R.version' variable
 a <- paste("R version used to execute the program: ", R.version$version.string)
 write(a, file=file_output1, append = TRUE)
 write("", file=file_output1, append = TRUE)

 # Data frame to store all estimation results of risk measures 
 risk_measures <- data.frame(matrix(ncol = 35, nrow = 0))
 colnames(risk_measures) <- c("AssetClass", "Instrument", "Mean", "Median", "StandardDeviation", "Variance", "Skewness", "Kurtosis",
                              "RM1_HistoricalVol", "RM1_StartDate", "RM1_EndDate", "RM1_NObs",
                              "RM2_3MVol", "RM2_StartDate", "RM2_EndDate", "RM2_NObs",
                              "RM3_VaR_historical_left", "RM3_VaR_historical_right", "RM3_VaR_normal_right", "RM3_VaR_normal_left", "RM3_StartDate", "RM3_EndDate", "RM3_NObs",
                              "RM4_ES_historical_left", "RM4_ES_historical_right", "RM4_ES_normal_left", "RM4_ES_normal_right", "RM4_StartDate", "RM4_EndDate", "RM4_NObs",
                              "RM5_SV_gpd_left", "RM5_SV_gpd_right", "RM5_StartDate", "RM5_EndDate", "RM5_NObs")

 # Import of the Yahoo Finance tickers
 # To know the starting date of the time-series on Yahoo Finance, click on "max" or enter 01/01/1900 in the csv file and check the date in the output file 
  
 # Choice between two approaches
 #choice_ticker <- 1 # The user enters the Yahoo Finance tickers as an input
 choice_ticker <- 2 # The Yahoo Finance tickers are read from an external file 
 
 if (choice_ticker == 1) { # The Yahoo Finance tickers are entered by the user as an input
   Yahoo_Finance_Tickers           <- data.frame(matrix(ncol = 5, nrow = 0))
   colnames(Yahoo_Finance_Tickers) <- c("Ticker","Asset.Class", "Instrument","Start.Date","End.Date")
  
   Ticker_Input        <- readline(prompt = "Please enter the Yahoo Finance Ticker: ")
   Asset.Class_Input   <- readline(prompt = "Please enter the Asset Class of this asset: ")
   Instrument_Input    <- readline(prompt = "Please enter the name of this asset: ")
   Start.Date_Input    <- readline(prompt = "Please enter the start date of calculation in MMDDYYYY: ")
   End.Date_Input      <- readline(prompt = "Please enter the end date of calculation in MMDDYYYY: ")
   
   
   Start.Date_Input    <- as.Date(Start.Date_Input,format = "%m%d%Y")
   End.Date_Input      <- as.Date(End.Date_Input,format = "%m%d%Y")
   
   Yahoo_Finance_Tickers_Input           <- data.frame(Ticker_Input,Asset.Class_Input,Instrument_Input,
                                                       Start.Date_Input,End.Date_Input)
   colnames(Yahoo_Finance_Tickers_Input) <- c("Ticker","Asset.Class", "Instrument","Start.Date","End.Date")

   for (i in (year(Start.Date_Input)+1):(year(End.Date_Input)-1)) {
     Yahoo_Finance_Tickers_Input2<- data.frame(Ticker_Input,Asset.Class_Input,Instrument_Input,
                                               as.Date(ISOdate(i,1,1),format = "%m%d%Y"),
                                               as.Date(ISOdate(i,12,31),format = "%m%d%Y"))
     colnames(Yahoo_Finance_Tickers_Input2) <- c("Ticker","Asset.Class", "Instrument","Start.Date","End.Date")
     
     Yahoo_Finance_Tickers_Input <- rbind(Yahoo_Finance_Tickers_Input, 
                                          Yahoo_Finance_Tickers_Input2)
   }
  
   Yahoo_Finance_Tickers <- rbind(Yahoo_Finance_Tickers,Yahoo_Finance_Tickers_Input)
 }
 
 if (choice_ticker == 2) {  # The Yahoo Finance tickers are read from an external file 
   Yahoo_Finance_Tickers <- read.csv(file="Yahoo_Finance_Tickers_2023_01_08B.csv",
                                     header = TRUE, sep=";") # separators could be a comma 
 } 
 
 
 

# Loop on the assets (Yahoo_Finance_Tickers)
 
for (i in 1:nrow(Yahoo_Finance_Tickers)){

  # *************************************************
  # ********** Step 2: getting data input ***********
  # *************************************************
  
  # Retrieve data from Yahoo Finance with the getSymbol function from the QuantMod R package
    data <- getSymbols(Yahoo_Finance_Tickers$Ticker[i], 
                        from = as.Date(Yahoo_Finance_Tickers$Start.Date[i],format = "%m/%d/%Y"), 
                        to   = as.Date(Yahoo_Finance_Tickers$End.Date[i],format = "%m/%d/%Y"),
                        env = NULL) 

    data <- data[!is.na(data[,6]),] #remove days with NA values for price (if any)
    
    # DataFile (name of the data file used for the estimation)
    DataFile <- file_output3
    
    # DataSource (origin of the data)
    DataSource <- "Yahoo Finance"
    
    # Enter the asset class and the instrument under study
    AssetClass = Yahoo_Finance_Tickers$Asset.Class[i]  
    Instrument = Yahoo_Finance_Tickers$Instrument[i] 
     
    # Compute logarithmic percentage returns (with respect to the daily closing prices)
    data$return = diff(log(data[,6]))*100
    # CFL_2022_11_28: to be checked (concordance with the date)
    data$return[1]=0
     
    # Convert the xts object (from the QuantMod R package) to a data frame and process dates 
    data <- data.frame(Date=index(data), coredata(data))
    data$Date <- as.POSIXct(data$Date)
    
    # Compute the number of observations, start date and end date of the time series
    Number_observations <- nrow(data)
    StartDate           <- min(data$Date)
    EndDate             <- max(data$Date)
  
    data <- cbind(Ticker = Yahoo_Finance_Tickers$Ticker[i], data)
    
    
    write.table(data, file_output3, sep = ",", append = T,
                col.names = !file.exists("myDF.csv"),row.names = F)
   
  # ***************************************************
  # ***** Step 3: computation of basic statistics *****
  # ***************************************************
  
    # Basic statistics (unconditional statistics computed over the entire period from 'Start date' to 'End date' for each time-series)
    summary(data$return)
    mean1    <- mean(data$return)
    median1  <- median(data$return)
    sd1      <- sd(data$return)
    var1     <- sd1**2
    kurt1    <- kurtosis(data$return)
    skew1    <- skewness(data$return)
    head(data$return, 6)
   
   
  # **************************************************************
  # ***** Step 4: definition of parameters for risk measures *****
  # **************************************************************
  
    # Length of the time period to estimate conditional volatility
    EstimationPeriodConditionalVolatility <- 60 # in days
    
    # Probability to compute VaR and ES
    Probability_VaR <- 0.95
    Probability_ES  <- 0.95
    
    # Probability to compute SV
    Probability_SV  <- 0.99
  
  
  # ************************************************
  # ***** Step 5: computation of risk measures *****
  # ************************************************
 
    # Returns are used to study a short position and  opposite returns for a long position
    data$opposite_return <- - data$return  
    
    # Estimation of the Gaussian distribution 
    Gaussian_fitting_right  <-  fitdistr(data$return,"normal")
    Gaussian_fitting_left   <-  fitdistr(data$opposite_return,"normal")
     
    # Estimation of the Generalized Pareto distribution (GPD) for the left and right tails of the distribution (use of the POT R package)
    
    # Threshold selection is a primordial, yet tricky subject to the correct and effective implementation of the POT-GPD framework. 
    # A commonly used, whereas oversimplified and not necessarily asymptotically correct, approach is to select a high quantile of the date, say the 95% percentile value. 
    # Otherwise, graphical selection for likelihood-based inference is often employed for determining the appropriate threshold.
    # Here, we adopt the simplest high-quantile approach for the purpose of easier automation. 
    # Bearing in mind that this approach is by no means optimal, the user is high recommended to consider other ways of threshold selection for more robust results. 
     
    GPD_fitting_right <- fitgpd(data$return, quantile(data$return,probs=Probability_VaR), est="mle")
    GPD_fitting_left  <- fitgpd(data$opposite_return, quantile(data$opposite_return,probs=Probability_VaR), est="mle")
     
    summary(GPD_fitting_right,silent=FALSE)
     
    # Historical volatility (annualized standard deviation of returns computed over the entire period / from 'Start date' to 'End date' for each time-series)
    RM1_StartDate  <- as.Date(min(data$Date), format = "%d/%m/%Y")
    RM1_EndDate    <- as.Date(max(data$Date), format = "%d/%m/%Y")
    RM1            <- data %>% filter(Date>=RM1_StartDate & Date <= RM1_EndDate)
    RM1_NObs       <- nrow(data)
    # CFL_2022_11_28: 252 (to be generalized as a parameter for each asset)
    RM1_HistoricalVol <- sd(data$return)*sqrt(252) # Annualized volatility
    
    # 3-month volatility (standard deviation of returns computed over a 3-month period / from 'End date - Estimation_period_Conditional_Volatility' to 'End date' for each time-series)
    # CFL_2022_11_28: 3 months or EstimationPeriodConditionalVolatility?
    RM2_StartDate  <- as.Date(max(data$Date)) %m-% months(3)
    RM2_EndDate    <- as.Date(max(data$Date))
    RM2            <- data %>% filter(Date>=RM2_StartDate & Date <= RM2_EndDate)
    RM2_NObs       <- nrow(RM2)
    RM2_3MVol      <- sd(RM2$return)*sqrt(252) # Annualized volatility
    
    # Value at Risk (VaR)
    RM3_StartDate  <- as.Date(min(data$Date))
    RM3_EndDate    <- as.Date(max(data$Date))
    RM3            <- data %>% filter(Date>=RM3_StartDate & Date <= RM3_EndDate)
    RM3_NObs       <- nrow(RM3)
    
    # Value at Risk (VaR) - Historical method
    RM3_VaR_historical_right  <- quantile(RM3$return,probs=Probability_VaR)
    RM3_VaR_historical_left   <- quantile(RM3$opposite_return,probs=Probability_VaR)
    
    # Value at Risk (VaR) - Parametric method - Normal distribution
    RM3_VaR_normal_left    <- qnorm(Probability_VaR,
                                    mean = Gaussian_fitting_left$estimate[1], 
                                    sd = Gaussian_fitting_left$estimate[2])
    RM3_VaR_normal_right   <- qnorm(Probability_VaR,
                                    mean = Gaussian_fitting_right$estimate[1], 
                                    sd = Gaussian_fitting_right$estimate[2])
     
    # Expected shortfall (ES) 
    RM4_StartDate  <- as.Date(min(data$Date))
    RM4_EndDate    <- as.Date(max(data$Date))
    RM4            <- data %>% filter(Date>=RM4_StartDate & Date <= RM4_EndDate)
    RM4_NObs       <- nrow(RM4)
    
    # Expected shortfall (ES) - Historical method
    RM4_ES_historical_right   <- mean(RM4$return[which(RM4$return>=quantile(RM4$return,probs=Probability_ES))])
    RM4_ES_historical_left    <- mean(RM4$opposite_return[which(RM4$opposite_return>=quantile(RM4$opposite_return,probs=Probability_ES))])
    
    # Expected shortfall (ES) - Parametric method - Normal distribution
    RM4_ES_normal_left   <- cvar::ES(qnorm, x = 1- Probability_ES, dist.type = "qf",
                                     mean = Gaussian_fitting_left$estimate[1], 
                                     sd = Gaussian_fitting_left$estimate[2])
    RM4_ES_normal_right   <- cvar::ES(qnorm, x = 1- Probability_ES, dist.type = "qf",
                                      mean = Gaussian_fitting_right$estimate[1], 
                                      sd = Gaussian_fitting_right$estimate[2])
     
    # Stress value (SV) - Parametric method - General Pareto distribution (GPD)
    RM5_StartDate  <- as.Date(min(data$Date))
    RM5_EndDate    <- as.Date(max(data$Date))
    RM5            <- data %>% filter(Date>=RM5_StartDate & Date <= RM5_EndDate)
    RM5_NObs       <- nrow(RM5) 
    RM5_SV_gpd_left   <- qgpd(Probability_SV,
                              scale = GPD_fitting_left$fitted.values[1],
                              shape = GPD_fitting_left$fitted.values[2])
    RM5_SV_gpd_right  <- qgpd(Probability_SV,
                              scale = GPD_fitting_right$fitted.values[1],
                              shape = GPD_fitting_right$fitted.values[2]) 
     
  
  # **********************************************
  # ***** Step 6: production of output files *****
  # **********************************************

    # Output CSV file with the following column names:
    # DataFile (name of the data file used for the estimation)
    # DataSource (origin of the data)
    # AssetClass (Equity, Bond, Foreign exchange, Commodities, Crypto, etc.)
    # Instrument (S&P 500 index, CAC 40 index, etc.)
    # StartDate (start date of the time series)
    # EndDate (end date of the time series)
    # NumberObservations (number of observations of the time series) 
    # Mean (unconditional mean computed from 'Start date' to 'End date' for each time series)
    # Standard deviation (unconditional standard deviation computed from 'Start date' to 'End date' for each time series)
    # Variance (unconditional variance computed from 'Start date' to 'End date' for each time series)
    # Skewness (unconditional skewness computed from 'Start date' to 'End date' for each time series)
    # Kurtosis (unconditional kurtosis computed from 'Start date' to 'End date' for each time series)
    # RiskMeasure1 (Historical volatility)
    # StartDateEstimationHistoricalVolatility (start date for the period of estimation of historical volatility)
    # EndDateEstimationHistoricalVolatility (end date for the period of estimation of historical volatility)
    # NumberObservationsEstimationHistoricalVolatility (number of observations in the period of estimation of historical volatility)
    # HistoricalVolatilityMeasure
    # RiskMeasure2 (3-month volatility)
    # StartDateEstimation3MonthVolatility (start date for the period of estimation of 3-month volatility)
    # EndDateEstimation3MonthVolatility (end date for the period of estimation of 3-month volatility)
    # NumberObservationsEstimation3MonthVolatility (number of observations in the period of estimation of 3-month volatility)
    # 3MonthVolatilityMeasure
    # RiskMeasure3 (Value at Risk)
    # StartDateEstimationValueAtRisk (start date for the period of estimation of value at risk)
    # EndDateEstimationValueAtRisk (end date for the period of estimation of value at risk)
    # NumberObservationsEstimationValueAtRisk (number of observations in the period of estimation of value at risk)
    # ValueAtRiskMeasure
    # RiskMeasure4 (Expected shortfall)
    # StartDateEstimationExpectedShortfall (start date for the period of estimation of expected shortfall)
    # EndDateEstimationExpectedShortfall (end date for the period of estimation of expected shortfall)
    # NumberObservationsEstimationExpectedShortfall (number of observations in the period of estimation of expected shortfall)
    # ExpectedShortfallMeasure
    # RiskMeasure5 (Stress value)
    # StartDateEstimationStressValue (start date for the period of estimation of stress value)
    # EndDateEstimationv (end date for the period of estimation of stress value)
    # NumberObservationsEstimationv (number of observations in the period of estimation of stress value)
    # StressValueMeasure
    # RFileName (name of the R program file used for the estimation of risk measures)
  
    # Add the new result to the data frame that stores all risk measures 
    new_entry <- data.frame(DataFile, DataSource, AssetClass, Instrument, mean1, median1, sd1, var1, skew1, kurt1,
                            RM1_StartDate, RM1_EndDate, RM1_NObs, RM1_HistoricalVol,
                            RM2_StartDate, RM2_EndDate, RM2_NObs, RM2_3MVol,
                            RM3_StartDate, RM3_EndDate, RM3_NObs, RM3_VaR_historical_left, RM3_VaR_historical_right, RM3_VaR_normal_left, RM3_VaR_normal_right,
                            RM4_StartDate, RM4_EndDate, RM4_NObs, RM4_ES_historical_left, RM4_ES_historical_right, RM4_ES_normal_left, RM4_ES_normal_right,
                            RM5_StartDate, RM5_EndDate, RM5_NObs, RM5_SV_gpd_left, RM5_SV_gpd_right)
    risk_measures <- rbind(risk_measures,new_entry)

} # End of the loop 'i in 1:nrow(Yahoo_Finance_Tickers' on the assets (Yahoo_Finance_Tickers)
 
# Add the results of risk measures to the output file
# CFL_2022_11_28: I don't understand the following line of code / myDF.csv?
write.table(risk_measures, file_output2, sep = ",", append = T,
            col.names = !file.exists("myDF.csv"),row.names = F)
write("", file=file_output2, append = TRUE) 


# *** Information for the execution file ***

# *** Compute the duration of the program

# Storing of the ending time of the program
ending_time = Sys.time()

a <- paste("Ending time of the program:", ending_time)
write(a, file=file_output1, append = TRUE)
write("", file=file_output1, append = TRUE)

# Computation of the duration of the program
diff_time = ending_time - starting_time
a <- paste("Duration of the program:", format(diff_time, digits=2, nsmall=2))
write(a, file=file_output1, append = TRUE)
write("", file=file_output1, append = TRUE)


##########################
### End of the program ###
##########################


# References :
#   Yahoo Finance: https://finance.yahoo.com 
#   https://fr.finance.yahoo.com/quote/%5EGSPC?p=^GSPC
#   https://www.rdocumentation.org/packages/quantmod/versions/0.4.20/topics/getSymbols.yahoo