1. The longley data set.

> library(MASS)
> data(longley)
> head(longley)
     GNP.deflator     GNP Unemployed Armed.Forces Population Year Employed
1947         83.0 234.289      235.6        159.0    107.608 1947   60.323
1948         88.5 259.426      232.5        145.6    108.632 1948   61.122
1949         88.2 258.054      368.2        161.6    109.773 1949   60.171
1950         89.5 284.599      335.1        165.0    110.929 1950   61.187
# Generation of the test and training sets
> nn <- nrow(longley)
> index <- 1:nrow(longley)
> testindex <- sample(index, trunc(length(index)/3))
> testindex
[1]  5 10  1 15  4
> testset <- longley[testindex,]
> trainset <- longley[-testindex,]
# Exclude the dependent variable (Employed, column 7)
> trainset1 <- trainset[,-7]
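Aside: sample() is unseeded here, so the split above is not reproducible. A minimal sketch of a reproducible version (the seed value is an arbitrary choice, not from the original run):

set.seed(42)                                    # fix the RNG so testindex is repeatable
testindex <- sample(1:nrow(longley), trunc(nrow(longley)/3))
testset   <- longley[testindex, ]
trainset  <- longley[-testindex, ]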
1(a). Ridge regression using the lm.ridge() function.

# Find the best lambda parameter
> select(lm.ridge(trainset$Employed ~ ., data = trainset1,
+                 lambda = seq(0, 10, 0.001)))
modified HKB estimator is 4.074423e-05
modified L-W estimator is 0.0004975858
smallest value of GCV  at 0
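The GCV-minimizing lambda reported by select() can also be extracted programmatically; a sketch using the lambda and GCV components of the lm.ridge fit:

fit <- lm.ridge(trainset$Employed ~ ., data = trainset1, lambda = seq(0, 10, 0.001))
fit$lambda[which.min(fit$GCV)]   # 0 here, matching the select() output above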
> ridgereg2 <- lm.ridge(trainset$Employed ~ ., data = trainset1, lambda = 0)
# Print the coefficients of the regression so that we can see the offset
# (the first coefficient). We will then offset our predicted values with it.
> lm.ridge(trainset$Employed ~ ., data = trainset1, lambda = 0)
               GNP.deflator           GNP    Unemployed  Armed.Forces    Population          Year
-4.402906e+03 -8.868277e-03 -5.470213e-02 -2.260566e-02 -1.167793e-02 -4.800817e-02  2.305525e+00
> pred.value <- scale(testset[,1:6], center = F, scale = ridgereg2$scales) %*% ridgereg2$coef
> tt <- nrow(testset)
# Let's have a look at our predicted values.
> head(pred.value)
         [,1]
1951 4465.487
1956 4470.334
1947 4462.957
1961 4471.758
1950 4464.585
# Below we add the offset to our predicted values.
> sumh = 0
> for(i in 1:tt) {
+   sumh = sumh + (pred.value[i] - 4402.90 - testset[i,7])^2
+ }
> sumg <- sumh/tt
> sumg
[1] 0.2245632
1(b). Ridge regression using the linearRidge() function.

> library(ridge)
> ridge1_reg <- linearRidge(formula = trainset$Employed ~ ., data = trainset1, lambda = "automatic")
> nn <- predict(ridge1_reg, testset[,1:6])
> sumh = 0
> for(i in 1:tt) {
+   sumh = sumh + (nn[i] - testset[i,7])^2
+ }
> sumg <- sumh/tt
> sumg
0.3804
NOTE: The test error for plain linear regression on this split is 0.227; lm.ridge() (0.2246) does slightly better, while linearRidge() (0.3804) does noticeably worse.
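Unlike the manual matrix product needed for lm.ridge, the predict() method for linearRidge objects applies the scaling and the intercept internally (as the loop above already assumes), so the test error can again be computed in one line (a sketch):

mean((predict(ridge1_reg, testset[, 1:6]) - testset$Employed)^2)   # ~0.3804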
2. The prostate data set.

> library(ElemStatLearn)
> data(prostate)
> head(prostate)
      lcavol  lweight age      lbph svi       lcp gleason pgg45       lpsa train
1 -0.5798185 2.769459  50 -1.386294   0 -1.386294       6     0 -0.4307829  TRUE
2 -0.9942523 3.319626  58 -1.386294   0 -1.386294       6     0 -0.1625189  TRUE
3 -0.5108256 2.691243  74 -1.386294   0 -1.386294       7    20 -0.1625189  TRUE
4 -1.2039728 3.282789  58 -1.386294   0 -1.386294       6     0 -0.1625189  TRUE
> trainset <- prostate[prostate$train==TRUE,]
> testset <- prostate[prostate$train==FALSE,]
# Exclude the logical train column (column 10)
> trainset1 <- trainset[,-10]
# Exclude the dependent variable (lpsa, column 9)
> trainset2 <- trainset1[,-9]
# The trainset contains 67 cases and the testset contains 30 cases.
# The dependent variable is lpsa.
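A quick sanity check of the split sizes (not part of the original transcript):

table(prostate$train)
# FALSE  TRUE
#    30    67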
2(a). Ridge regression using the lm.ridge() function.

> select(lm.ridge(trainset$lpsa ~ ., data = trainset2,
+                 lambda = seq(0, 10, 0.001)))
modified HKB estimator is 3.355691
modified L-W estimator is 3.050708
smallest value of GCV  at 4.922
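To see how flat or sharp the GCV criterion is around the chosen value, the curve can be plotted directly; a sketch over the same lambda grid:

fit <- lm.ridge(trainset$lpsa ~ ., data = trainset2, lambda = seq(0, 10, 0.001))
plot(fit$lambda, fit$GCV, type = "l", xlab = "lambda", ylab = "GCV")
abline(v = fit$lambda[which.min(fit$GCV)], lty = 2)   # dashed line at ~4.922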
> ridgereg2 <- lm.ridge(trainset$lpsa ~ ., data = trainset2, lambda = 4.922)
> lm.ridge(trainset$lpsa ~ ., data = trainset2, lambda = 4.922)
                   lcavol      lweight          age         lbph          svi          lcp
 0.095859807  0.492493357  0.601029107 -0.014805753  0.137989392  0.679417525 -0.116496939
     gleason        pgg45
 0.017243225  0.007074211
> pred.value <- scale(testset[,1:8], center = F, scale = ridgereg2$scales) %*% ridgereg2$coef
> tt <- nrow(testset)
# Note how we add the offset (the first coefficient above) to the predicted values.
> sumh = 0
> for(i in 1:tt) {
+   sumh = sumh + (pred.value[i] + 0.0958 - testset[i,9])^2
+ }
> sumg <- sumh/tt
> sumg
[1] 0.4943575
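As in part 1, the hand-copied offset 0.0958 can be avoided by using coef(), which returns the intercept and original-scale coefficients; a sketch:

pred <- as.matrix(cbind(1, testset[, 1:8])) %*% coef(ridgereg2)
mean((pred - testset$lpsa)^2)   # ~0.494, matching the loop above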
2(b). Ridge regression using the linearRidge() function.

> library(ridge)
> ridge1_reg <- linearRidge(formula = trainset$lpsa ~ ., data = trainset2, lambda = "automatic")
> nn <- predict(ridge1_reg, testset[,1:8])
> sumh = 0
> for(i in 1:tt) {
+   sumh = sumh + (nn[i] - testset[i,9])^2
+ }
> sumg <- sumh/tt
> sumg
        7
0.4895868
NOTE: The linear regression test error is 0.5212, so ridge regression does better in terms of test error.
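For reference, the 0.5212 linear-regression baseline quoted in the NOTE would come from an ordinary least-squares fit on the same split; a sketch of one way to compute it (the exact call used originally is not shown in the transcript):

lmfit <- lm(lpsa ~ . - train, data = trainset)           # drop the train indicator
mean((predict(lmfit, testset) - testset$lpsa)^2)         # quoted as 0.5212 above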