As an ordered list of actions:
- Load data - daily Yahoo Finance prices for GOOG and SPY covering 5 years, keeping only the date and the adjusted closing price for this analysis
- Transform sources: Merge sources, change column headings, cast the Date column as DATE type, sort descending
- Perform logistic regression: model whether GOOG's daily return is non-negative from the previous day's GOOG and SPY returns
- Create a frame of actual versus predicted changes, and add a column for the correct/incorrect prediction result
- Find the share of days on which the predicted direction of the price move (up or down) matched the actual direction
Source data is here.
# Load data
# Data is the Yahoo Finance price for each day for 5 years; only the date and
# the adjusted closing price are kept for this analysis.
#
# The input files are addressed by path instead of calling setwd(), and the
# workspace is not wiped with rm(list = ls()): both are session-wide side
# effects that silently affect anything else run in the same R session.
data.dir <- file.path("..", "Data")
# Show the resolved data directory and stop right away if it does not exist
normalizePath(data.dir, mustWork = TRUE)
# Columns kept from each file
keep.columns <- c("Date", "Adj.Close")
GOOG.data <- read.csv(file.path(data.dir, "GOOG.csv"), header = TRUE, sep = ",")[, keep.columns]
SPY.data <- read.csv(file.path(data.dir, "SPY.csv"), header = TRUE, sep = ",")[, keep.columns]
# Merge sources: join the two price tables on their shared Date column
GOOG.merged <- merge(GOOG.data, SPY.data, by = "Date")
# Change column headings: after the merge, column 2 holds the GOOG price and
# column 3 the SPY price
names(GOOG.merged)[2:3] <- c("GOOG.Close", "SPY.Close")
# Cast as date
# In some data sets the date is formatted as MM/DD/YYYY. A bare as.Date() can
# accept such a string as year/month/day and return a wrong date without any
# error, so that layout is detected and parsed with an explicit format.
# Anything else is parsed exactly as before.
date.text <- as.character(GOOG.merged$Date)
is.month.first <- grepl("^[0-9]{1,2}/[0-9]{1,2}/[0-9]{4}$", date.text)
if (length(date.text) > 0 && all(is.month.first)) {
  GOOG.merged$Date <- as.Date(date.text, format = "%m/%d/%Y")
} else {
  GOOG.merged$Date <- as.Date(date.text)
}
# Sort descending (most recent day first)
GOOG.merged <- GOOG.merged[order(GOOG.merged$Date, decreasing = TRUE), ]
# Calculate returns relative to the next row down. With GOOG.merged sorted
# most recent first, the next row is the previous trading day.
last.row <- nrow(GOOG.merged)
# Price columns (everything except Date) for each day and for the day before it
current.prices <- GOOG.merged[-last.row, -1]
previous.prices <- GOOG.merged[-1, -1]
# The oldest day has no earlier price to compare with, so it is left out
GOOG.returns <- GOOG.merged[-last.row, ]
GOOG.returns[, -1] <- current.prices / previous.prices - 1
# The price columns now hold returns, so rename them accordingly
names(GOOG.returns)[2:3] <- c("GOOG.Returns", "SPY.Returns")
# Correlation test between the GOOG and SPY price levels; the bare name on the
# second line displays the result
corr.Prices <- cor.test(x = GOOG.merged$GOOG.Close, y = GOOG.merged$SPY.Close)
corr.Prices
# Correlation test between the GOOG and SPY returns, displayed the same way
corr.Return <- cor.test(x = GOOG.returns$GOOG.Returns, y = GOOG.returns$SPY.Returns)
corr.Return
# Pair each day's returns with the returns from the row below it (the previous
# trading day, given the descending date order)
row.count <- nrow(GOOG.returns)
# Returns shifted by one row, named up front so they do not clash with the
# same-day return columns
lagged.returns <- GOOG.returns[-1, -1]
names(lagged.returns) <- c("GOOG.Returns.Lagged", "SPY.Returns.Lagged")
# The last row has no earlier row to pair with, so it is left out
GOOG.lagging <- data.frame(GOOG.returns[-row.count, ], lagged.returns)
# Response column for the logistic regression: TRUE when GOOG's return is
# zero or positive
GOOG.lagging$Up <- GOOG.lagging$GOOG.Returns >= 0
# Perform logistic regression
# Models whether GOOG's return is non-negative (Up) from the lagged GOOG and
# SPY returns.
# `family` is written out in full instead of relying on partial matching of
# `fam`, and the columns are supplied through `data` so the coefficients are
# labelled with plain column names and the model can be applied to new data.
GOOG.logRegression <- glm(
  Up ~ GOOG.Returns.Lagged + SPY.Returns.Lagged,
  family = binomial,
  data = GOOG.lagging
)
summary(GOOG.logRegression)
# Create a frame of actual versus predicted direction
# A fitted probability of at least 0.5 counts as a predicted "up" day
GOOG.fitted <- data.frame(
  Actual = GOOG.lagging$Up,
  Predicted = fitted(GOOG.logRegression) >= 0.5
)
# Create column for correct/incorrect prediction
GOOG.fitted$CorrectForecast <- GOOG.fitted$Actual == GOOG.fitted$Predicted
# Find the share of correct forecasts (a fraction between 0 and 1)
# The mean of a logical vector is its proportion of TRUE values. Rows that
# cannot be scored (NA) are left out rather than counted as correct, which is
# what selecting with `== TRUE` did.
mean(GOOG.fitted$CorrectForecast, na.rm = TRUE)
0 Comments