Update 2: [15 May 2011] The function I wrote here does not work with factors due to the way in which I constructed fitted values for the standard error correction in the second stage.
I recorded a new video tutorial on how to conduct an IV regression. This time I used my new ivregress() command, which has much better syntax than my old one. There are several reasons I needed to do this:
- Relative to my previous post on IV regression, I have added the ability to conduct overidentifying tests. Now, my ivregress() command has all of the basic functionality of Stata's 2sls option to ivregress.
- Relative to my previous video tutorial on IV regression, this video uses my newer command, which as much better syntax. As such, I will use this video tutorial to replace the old one.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## ----------------------------------- ## | |
## A homegrown IV regression command ## | |
## that handles the case where there ## | |
## is one endogenous variable and many ## | |
## (or one) instrument. ## | |
## ## | |
## Disadvantage: The command doesn't ## | |
## handle multiple endogenous vars ## | |
## ## | |
## Advantage: testing for relevance, ## | |
## displaying the first stage and ## | |
## conducting overidentification tests ## | |
## are natural and easy. ## | |
## ----------------------------------- ## | |
ivregress = function(second, first, data){ | |
s.names = all.vars(second) | |
f.names = all.vars(first) | |
data.names = names(data) | |
N = length(data[,1]) | |
all.names = c(s.names,f.names) | |
resp = s.names[1] | |
endog = f.names[1] | |
inst = f.names[-1] | |
explan = s.names[-1] | |
exog = explan[explan!=endog] | |
exog.f = paste(exog,collapse="+") | |
inst.f = paste(inst, collapse="+") | |
RHS = paste(exog.f, inst.f, sep="+") | |
first.form = as.formula( paste(endog, "~", RHS)) | |
first.lm = lm(first.form, data) | |
ftest = linearHypothesis(first.lm, inst, rep(0,length(inst))) | |
x.hat = fitted(first.lm) | |
data2 = cbind(data,x.hat) | |
iname = paste(endog,".hat",sep="") | |
names(data2) = c(names(data), iname) | |
data2.names = names(data2) | |
RHS2 = paste(exog.f,iname,sep="+") | |
second.form = as.formula(paste(resp, "~", RHS2)) | |
second.lm = lm(second.form, data2) | |
Xmat = data2[c(exog,endog)] | |
Xmat2 = data2[c(exog,iname)] | |
z = summary(second.lm) | |
X = as.matrix(cbind(1,Xmat)) | |
X2 = as.matrix(cbind(1,Xmat2)) | |
Y = data[resp] | |
fit = X%*%second.lm$coefficients | |
res = Y - fit | |
## Tests for overidentifying restrictions ## | |
data3 = cbind(data2, res) | |
names(data3) = c(names(data2), "res") | |
## J test | |
J.test = as.data.frame(matrix(c(1,2),nrow=1)) | |
names(J.test) = c("J.stat","P[J > J.stat ]") | |
## Sargan's Test | |
S.test = as.data.frame(matrix(c(1,2),nrow=1)) | |
names(S.test) = c("S.stat","P[S > S.stat ]") | |
if(length(inst)>1){ | |
J.form = as.formula(paste("res", "~", RHS)) | |
J.lm = lm(J.form, data3) | |
f.test = linearHypothesis(J.lm,inst, rep(0,length(inst))) | |
J.stat = length(inst)*f.test$F[2] | |
S.stat = N*summary(J.lm)$r.squared | |
} | |
J.test[1,1] = J.stat | |
J.test[1,2] = 1-pchisq(J.stat, length(inst)-1) | |
S.test[1,1] = S.stat | |
S.test[1,2] = 1-pchisq(S.stat, length(inst)-1) | |
xPx = t(X2)%*%X2 | |
xPx.inv = solve(xPx) | |
z$cov.unscaled = xPx.inv | |
z$residuals = res | |
z$sigma = sqrt(mean(res^2)) | |
varcovmat = z$cov.unscaled*z$sigma | |
coef = z$coefficients[,1] | |
IV.SE = z$sigma*sqrt(diag(xPx.inv)) | |
t.iv = coef/IV.SE | |
p.val = 2*(1-pnorm(abs(t.iv))) | |
z$coefficients[,2] = IV.SE | |
z$coefficients[,3] = t.iv | |
z$coefficients[,4] = p.val | |
result = list(summary(first.lm),z,ftest,J.test, S.test) | |
names(result) = c("first", "second", "ftest", "Jtest", "Sargan") | |
print("IV object successfully created. Use sum.iv() on object") | |
print("to learn about your 2SLS Regression") | |
return(invisible(result)) | |
} | |
sum.iv = function(reg.iv, first=FALSE, | |
second=TRUE, ftest=FALSE, overid=FALSE) { | |
x= rep(0,5) | |
if(first==TRUE) x[1] = 1 | |
if(second==TRUE) x[2]= 2 | |
if(ftest==TRUE) x[3]= 3 | |
if(overid==TRUE) x[4:5] = 4:5 | |
print(reg.iv[x]) | |
} |
To get ivregress() and sum.iv() to work, (1) copy the text in the above text box into an R script, (2) run all (or just download and run the R script) and (3) save into your default workspace if you want to use the commands into the future. Then, you're good to go. See my video demonstrating how to use the command here:
Here is the code I used to produce the output in the video.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## ------------------------ ## | |
## An Easy IV regression ## | |
## regression command for R ## | |
## ## | |
## Author: Tony Cookson ## | |
## Commands: ivregress() ## | |
## sum.iv() ## | |
## ------------------------ ## | |
## Read Data ## | |
library(foreign) | |
mkt.df = read.dta("C://R//mktshare.dta") | |
## Run IV Regression ## | |
library(car) ## Uses linearHypothesis() from car() | |
## Define the IV object | |
mkt.iv = ivregress(Y~x1+x2+p, p~z1+z2, mkt.df) | |
## Options to summarize the IV object | |
sum.iv(mkt.iv) ## Basic Output | |
sum.iv(mkt.iv, first=T) ## Show first stage | |
##-- like Stata's ", first" | |
sum.iv(mkt.iv, ftest=T) ## F-test for relevance | |
##-- same as Stata's "estat first" | |
sum.iv(mkt.iv, overid=T) ## Test for overidentifying restrictions | |
##-- same as Stata's "estat overid" | |
sum.iv(mkt.iv, overid=T, first=T, ftest=T) |
Finally, here is a link to the synthetic "market share" data I used in the video.
I hope you find this command helpful.
Hey Tony,
ReplyDeleteCan you tell me if this package is geared for your specific data set or have you tried multiple data sets?
Douglas from Michigan
Douglas,
ReplyDeleteI have tested it on my own data set (to make sure it matches Stata's calculations exactly), but I wrote the function to work for any data set where you have one endogenous variable (and possibly many instruments). If you have more than one endogenous variable, my function won't do that (yet... I may add that functionality in my next update).
Tony,
ReplyDeleteGreat video and code. Thanks!
Would it be possible to share the code used to generate the dataset mktshare.dta ?
Also the link to the data (mktshare.dta) seems to have too many redirects. Can you make it available as a CSV file (mktshare.dta.SCV.txt)?
Thanks,
James