r-multiple regression analysis

Prepared by VOLKAN OBAN

Multiple Regression Analysis in R

> # Load the packages used in this session.
> library(car)
> library(corrplot) # We'll use corrplot later on in this example too.
> library(visreg) # This library will allow us to show multivariate graphs
> library(rgl)
> library(knitr)
> library(scatterplot3d)
> # Preview the first five rows of Prestige (presumably supplied via the car
> # package loaded above -- no explicit data() call appears in this session).
> head(Prestige, 5)
                    education income women prestige census type
gov.administrators      13.11  12351 11.16     68.8   1113 prof
general.managers        12.26  25879  4.02     69.1   1130 prof
accountants             12.77   9271 15.70     63.4   1171 prof
purchasing.officers     11.42   8865  9.11     56.8   1175 prof
chemists                14.62   8403 11.68     73.5   2111 prof

> # Keep only the first four columns (education, income, women, prestige)
> # and print their five-number summaries plus means.
> newdata <- Prestige[, 1:4]
> summary(newdata)
   education          income          women           prestige
 Min.   : 6.380   Min.   :  611   Min.   : 0.000   Min.   :14.80
 1st Qu.: 8.445   1st Qu.: 4106   1st Qu.: 3.592   1st Qu.:35.23
 Median :10.540   Median : 5930   Median :13.600   Median :43.60
 Mean   :10.738   Mean   : 6798   Mean   :28.979   Mean   :46.83
 3rd Qu.:12.648   3rd Qu.: 8187   3rd Qu.:52.203   3rd Qu.:59.27
 Max.   :15.970   Max.   :25879   Max.   :97.510   Max.   :87.20

> # Pairwise scatterplot matrix of every column currently in newdata.
> plot(
+   newdata,
+   main = "Matrix Scatterplot of Income, Education, Women and Prestige",
+   col = "pink",
+   pch = 16
+ )

> # No visible step below draws random numbers; the seed is kept as in the
> # original session.
> set.seed(1)
>
> # Center predictors (subtract the mean, do not rescale).
> education.c <- scale(newdata$education, center = TRUE, scale = FALSE)
> prestige.c <- scale(newdata$prestige, center = TRUE, scale = FALSE)
> women.c <- scale(newdata$women, center = TRUE, scale = FALSE)
>
> # bind these new variables into newdata and display a summary.
> new.c.vars <- cbind(education.c, prestige.c, women.c)
> newdata <- cbind(newdata, new.c.vars)
> names(newdata)[5:7] <- c("education.c", "prestige.c", "women.c")
> summary(newdata)
   education          income          women           prestige
 Min.   : 6.380   Min.   :  611   Min.   : 0.000   Min.   :14.80
 1st Qu.: 8.445   1st Qu.: 4106   1st Qu.: 3.592   1st Qu.:35.23
 Median :10.540   Median : 5930   Median :13.600   Median :43.60
 Mean   :10.738   Mean   : 6798   Mean   :28.979   Mean   :46.83
 3rd Qu.:12.648   3rd Qu.: 8187   3rd Qu.:52.203   3rd Qu.:59.27
 Max.   :15.970   Max.   :25879   Max.   :97.510   Max.   :87.20
  education.c       prestige.c         women.c
 Min.   :-4.358   Min.   :-32.033   Min.   :-28.98
 1st Qu.:-2.293   1st Qu.:-11.608   1st Qu.:-25.39
 Median :-0.198   Median : -3.233   Median :-15.38
 Mean   : 0.000   Mean   :  0.000   Mean   :  0.00
 3rd Qu.: 1.909   3rd Qu.: 12.442   3rd Qu.: 23.22
 Max.   : 5.232   Max.   : 40.367   Max.   : 68.53
> # First model: income regressed on all three centered predictors.
> mod1 <- lm(income ~ education.c + prestige.c + women.c, data = newdata)
> summary(mod1)

Call:
lm(formula = income ~ education.c + prestige.c + women.c, data = newdata)

Residuals: Min 1Q Median 3Q Max -7715.3 -929.7 -231.2 689.7 14391.8

Coefficients:

             Estimate Std. Error t value Pr(>|t|)
(Intercept)  6797.902    254.934  26.665  < 2e-16 ***
education.c   177.199    187.632   0.944    0.347
prestige.c    141.435     29.910   4.729 7.58e-06 ***
women.c       -50.896      8.556  -5.948 4.19e-08 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2575 on 98 degrees of freedom
Multiple R-squared:  0.6432, Adjusted R-squared:  0.6323
F-statistic: 58.89 on 3 and 98 DF,  p-value: < 2.2e-16

> # Correlation matrix of the first four columns of newdata, drawn with the
> # coefficients printed as numbers.
> newdatacor <- cor(newdata[1:4])
> corrplot(newdatacor, method = "number", main = "\n by VOLKAN OBAN")

> # Second model: same response as mod1 but without education.c.
> mod2 <- lm(income ~ prestige.c + women.c, data = newdata)
> summary(mod2)

Call:
lm(formula = income ~ prestige.c + women.c, data = newdata)

Residuals: Min 1Q Median 3Q Max -7620.9 -1008.7 -240.4 873.1 14180.0

Coefficients:

            Estimate Std. Error t value Pr(>|t|)
(Intercept) 6797.902    254.795  26.680  < 2e-16 ***
prestige.c   165.875     14.988  11.067  < 2e-16 ***
women.c      -48.385      8.128  -5.953 4.02e-08 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2573 on 99 degrees of freedom
Multiple R-squared:  0.64, Adjusted R-squared:  0.6327
F-statistic: 87.98 on 2 and 99 DF,  p-value: < 2.2e-16

> # Diagnostic plot for mod2; which = 1 selects the residuals-vs-fitted panel.
> plot(mod2, main = "by VOLKAN OBAN", pch = 16, which = 1)

> # Regular grid over the two centered predictors, with mod2's prediction
> # stored in column pp for every grid point.
> newdat <- expand.grid(prestige.c = seq(-35, 45, by = 5),
+                       women.c = seq(-25, 70, by = 5))
> newdat$pp <- predict(mod2, newdata = newdat)
> # Observed points as spheres, then the fitted surface on top of them
> # (alpha = 0.3, front and back faces drawn as lines).
> with(newdata, plot3d(prestige.c, women.c, income, col = "blue", size = 1,
+                      type = "s", main = "3D Linear Model Fit"))
> with(newdat, surface3d(unique(prestige.c), unique(women.c), pp,
+                        alpha = 0.3, front = "line", back = "line"))
> # Third model: log(income) with linear and squared terms of both predictors.
> mod3 <- lm(log(income) ~ prestige.c + I(prestige.c^2) + women.c + I(women.c^2),
+            data = newdata)
> summary(mod3)

Call:
lm(formula = log(income) ~ prestige.c + I(prestige.c^2) + women.c + I(women.c^2), data = newdata)

Residuals: Min 1Q Median 3Q Max -1.01614 -0.10973 0.00966 0.14479 0.80844

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)
(Intercept)      8.809e+00  5.944e-02 148.188  < 2e-16 ***
prestige.c       2.518e-02  1.787e-03  14.096  < 2e-16 ***
I(prestige.c^2) -2.605e-04  9.396e-05  -2.773  0.00666 **
women.c         -6.306e-03  1.476e-03  -4.271 4.53e-05 ***
I(women.c^2)    -7.194e-05  4.014e-05  -1.792  0.07620 .
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.293 on 97 degrees of freedom
Multiple R-squared:  0.7643, Adjusted R-squared:  0.7546
F-statistic: 78.64 on 4 and 97 DF,  p-value: < 2.2e-16

> # Diagnostic plot for mod3; which = 1 selects the residuals-vs-fitted panel.
> plot(
+   mod3,
+   which = 1,
+   pch = 16
+ )

> # Same prediction grid as used for mod2, now filled with mod3's predictions
> # (on the log(income) scale, matching the model's response).
> newdat2 <- expand.grid(prestige.c = seq(-35, 45, by = 5),
+                        women.c = seq(-25, 70, by = 5))
> newdat2$pp <- predict(mod3, newdata = newdat2)
> # Observed log(income) as spheres, then the fitted surface on top of them
> # (alpha = 0.3, front and back faces drawn as lines).
> with(newdata, plot3d(prestige.c, women.c, log(income), col = "blue", size = 1,
+                      type = "s",
+                      main = "3D Quadratic Model Fit with Log of Income"))
> with(newdat2, surface3d(unique(prestige.c), unique(women.c), pp,
+                         alpha = 0.3, front = "line", back = "line"))

Ref: Felipe Rego

https://rstudio-pubs-static.s3.amazonaws.com/65641_88a692252c6c4f2ab279d115e59e6767.html

r-multiple regression analysis

Data & Analytics