추정치 구하기
추정치 구하기
-
predict() 사용
야구 데이터 추정하기
-
홈런(HR)에 대한 루타(TB) 회귀분석 하기
# Load the 2015 KBO (Korean baseball) team batting data and preview 3 rows
df_kbo <- read.csv('r-ggagi-data/example_kbo2015.csv')
head(df_kbo, 3)
ranking | team | AVG | G | PA | AB | R | H | X2B | X3B | ⋯ | IBB | HBP | SO | GDP | SLG | OBP | OPS | MH | RISP | PH.BA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<int> | <chr> | <dbl> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | ⋯ | <int> | <int> | <int> | <int> | <dbl> | <dbl> | <dbl> | <int> | <dbl> | <dbl> | |
1 | 1 | 삼성 | 0.300 | 102 | 4091 | 3553 | 634 | 1066 | 188 | 19 | ⋯ | 15 | 47 | 650 | 78 | 0.471 | 0.374 | 0.845 | 101 | 0.300 | 0.216 |
2 | 2 | 넥센 | 0.300 | 102 | 4151 | 3620 | 658 | 1085 | 227 | 12 | ⋯ | 13 | 61 | 815 | 79 | 0.498 | 0.374 | 0.872 | 100 | 0.294 | 0.268 |
3 | 3 | 두산 | 0.291 | 99 | 3950 | 3410 | 570 | 991 | 176 | 16 | ⋯ | 17 | 66 | 544 | 95 | 0.438 | 0.368 | 0.806 | 98 | 0.284 | 0.262 |
# Fit the regression model: home runs (HR) as a function of total bases (TB)
Lm <- lm(HR~TB, data = df_kbo)
Lm # y = -109.2696 + 0.1441x  (~1.44 home runs per 10 total bases — TB, not hits)
Call:
lm(formula = HR ~ TB, data = df_kbo)
Coefficients:
(Intercept) TB
-109.2696 0.1441
# Validate the model
# Multiple R-squared : 0.9039  (TB explains ~90% of the variance in HR)
# p-value : 2.423e-05 — probability of data this extreme if the true slope were 0
# Pr(>|t|) 2.42e-05 < 0.05, so the TB coefficient is statistically significant
summary(Lm)
Call:
lm(formula = HR ~ TB, data = df_kbo)
Residuals:
Min 1Q Median 3Q Max
-7.991 -4.702 -3.498 4.471 11.884
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -109.26964 24.92619 -4.384 0.00234 **
TB 0.14411 0.01661 8.677 2.42e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 7.963 on 8 degrees of freedom
Multiple R-squared: 0.9039, Adjusted R-squared: 0.8919
F-statistic: 75.28 on 1 and 8 DF, p-value: 2.423e-05
# Compute fitted (predicted) HR values for every team in the training data
p <- predict(Lm)
p
- 1
- 131.825268238313
- 2
- 150.2712623156
- 3
- 105.885589067127
- 4
- 117.990772680347
- 5
- 118.278991337805
- 6
- 88.3042509622124
- 7
- 85.7102830450938
- 8
- 91.3305468655174
- 9
- 86.286720360009
- 10
- 83.1163151279752
# Compare actual HR with the fitted value for the first two teams
c(df_kbo$HR[1], p[1])
c(df_kbo$HR[2], p[2])
- 1
- 127
- 1
- 131.825268238313
- 1
- 155
- 2
- 150.2712623156
# Compare actual vs fitted values for all teams
# NOTE: both columns are named fittedHR; data.frame() auto-renames the second
# duplicate to fittedHR.1 to keep column names unique (see the output below)
Com <- data.frame(team = df_kbo$team, HR = df_kbo$HR, fittedHR = round(p), fittedHR = p)
Com
team | HR | fittedHR | fittedHR.1 | |
---|---|---|---|---|
<chr> | <int> | <dbl> | <dbl> | |
1 | 삼성 | 127 | 132 | 131.82527 |
2 | 넥센 | 155 | 150 | 150.27126 |
3 | 두산 | 98 | 106 | 105.88559 |
4 | NC | 110 | 118 | 117.99077 |
5 | 롯데 | 130 | 118 | 118.27899 |
6 | SK | 92 | 88 | 88.30425 |
7 | 한화 | 82 | 86 | 85.71028 |
8 | kt | 87 | 91 | 91.33055 |
9 | LG | 83 | 86 | 86.28672 |
10 | KIA | 95 | 83 | 83.11632 |
루타가 1600일 때, 홈런의 개수는?
# newdata must be a data.frame whose column name matches the predictor (TB)
newTeam <- data.frame(TB = 1600)
predict(Lm, newTeam)
1: 121.30528724111
# Samsung (row 1)'s actual total bases, for comparison with the 1600 used above
df_kbo[1,]$TB
1673
mtcars 데이터로 예측하기
-
연비(mpg) 종속변수(y), 차체 무게를 독립변수(x)로 회귀모델 구하기
df <- mtcars
head(df, 3)
mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb | |
---|---|---|---|---|---|---|---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
Mazda RX4 | 21.0 | 6 | 160 | 110 | 3.90 | 2.620 | 16.46 | 0 | 1 | 4 | 4 |
Mazda RX4 Wag | 21.0 | 6 | 160 | 110 | 3.90 | 2.875 | 17.02 | 0 | 1 | 4 | 4 |
Datsun 710 | 22.8 | 4 | 108 | 93 | 3.85 | 2.320 | 18.61 | 1 | 1 | 4 | 1 |
str(df)
'data.frame': 32 obs. of 11 variables:
$ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
$ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
$ disp: num 160 160 108 258 360 ...
$ hp : num 110 110 93 110 175 105 245 62 95 123 ...
$ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
$ wt : num 2.62 2.88 2.32 3.21 3.44 ...
$ qsec: num 16.5 17 18.6 19.4 17 ...
$ vs : num 0 0 1 1 0 1 0 1 1 1 ...
$ am : num 1 1 1 0 0 0 0 0 0 0 ...
$ gear: num 4 4 4 3 3 3 3 4 4 4 ...
$ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Fit the regression model: mpg as a function of weight
Lm <- lm(mpg~wt, data=df)
Lm # mpg drops ~5.34 per unit of wt (mtcars wt is in 1000-lb units, not metric tons)
Call:
lm(formula = mpg ~ wt, data = df)
Coefficients:
(Intercept) wt
37.285 -5.344
# Validate the model
# R-squared : 0.7528
# p-value : 0.0000000001294 — probability of data this extreme if the true
# slope were 0 (NOT "the probability that x becomes 0")
options(scipen = 999)  # suppress scientific notation in the printed summary
summary(Lm)
Call:
lm(formula = mpg ~ wt, data = df)
Residuals:
Min 1Q Median 3Q Max
-4.5432 -2.3647 -0.1252 1.4096 6.8727
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 37.2851 1.8776 19.858 < 0.0000000000000002 ***
wt -5.3445 0.5591 -9.559 0.000000000129 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.046 on 30 degrees of freedom
Multiple R-squared: 0.7528, Adjusted R-squared: 0.7446
F-statistic: 91.38 on 1 and 30 DF, p-value: 0.0000000001294
# Compute fitted mpg values for every car in the training data
p <- predict(Lm)
p
- Mazda RX4
- 23.2826106468086
- Mazda RX4 Wag
- 21.9197703957643
- Datsun 710
- 24.8859521186254
- Hornet 4 Drive
- 20.1026500610386
- Hornet Sportabout
- 18.900143957176
- Valiant
- 18.7932545257216
- Duster 360
- 18.2053626527221
- Merc 240D
- 20.2362618503567
- Merc 230
- 20.4500407132656
- Merc 280
- 18.900143957176
- Merc 280C
- 18.900143957176
- Merc 450SE
- 15.5331268663607
- Merc 450SL
- 17.3502472010864
- Merc 450SLC
- 17.0830236224503
- Cadillac Fleetwood
- 9.22665041054798
- Lincoln Continental
- 8.29671235689423
- Chrysler Imperial
- 8.71892561113932
- Fiat 128
- 25.5272887073521
- Honda Civic
- 28.6538045773949
- Toyota Corolla
- 27.4780208313959
- Toyota Corona
- 24.1110037405806
- Dodge Challenger
- 18.4725862313582
- AMC Javelin
- 18.9268663150396
- Camaro Z28
- 16.762355328087
- Pontiac Firebird
- 16.7356329702233
- Fiat X1-9
- 26.9435736741236
- Porsche 914-2
- 25.8479570017155
- Lotus Europa
- 29.1989406778126
- Ford Pantera L
- 20.3431512818111
- Ferrari Dino
- 22.4809399109002
- Maserati Bora
- 18.2053626527221
- Volvo 142E
- 22.427495195173
# Compare actual vs fitted mpg for every car
data.frame(mpg=df$mpg, fittedMPG=p)
mpg | fittedMPG | |
---|---|---|
<dbl> | <dbl> | |
Mazda RX4 | 21.0 | 23.282611 |
Mazda RX4 Wag | 21.0 | 21.919770 |
Datsun 710 | 22.8 | 24.885952 |
Hornet 4 Drive | 21.4 | 20.102650 |
Hornet Sportabout | 18.7 | 18.900144 |
Valiant | 18.1 | 18.793255 |
Duster 360 | 14.3 | 18.205363 |
Merc 240D | 24.4 | 20.236262 |
Merc 230 | 22.8 | 20.450041 |
Merc 280 | 19.2 | 18.900144 |
Merc 280C | 17.8 | 18.900144 |
Merc 450SE | 16.4 | 15.533127 |
Merc 450SL | 17.3 | 17.350247 |
Merc 450SLC | 15.2 | 17.083024 |
Cadillac Fleetwood | 10.4 | 9.226650 |
Lincoln Continental | 10.4 | 8.296712 |
Chrysler Imperial | 14.7 | 8.718926 |
Fiat 128 | 32.4 | 25.527289 |
Honda Civic | 30.4 | 28.653805 |
Toyota Corolla | 33.9 | 27.478021 |
Toyota Corona | 21.5 | 24.111004 |
Dodge Challenger | 15.5 | 18.472586 |
AMC Javelin | 15.2 | 18.926866 |
Camaro Z28 | 13.3 | 16.762355 |
Pontiac Firebird | 19.2 | 16.735633 |
Fiat X1-9 | 27.3 | 26.943574 |
Porsche 914-2 | 26.0 | 25.847957 |
Lotus Europa | 30.4 | 29.198941 |
Ford Pantera L | 15.8 | 20.343151 |
Ferrari Dino | 19.7 | 22.480940 |
Maserati Bora | 15.0 | 18.205363 |
Volvo 142E | 21.4 | 22.427495 |
# Predict mpg when wt = 6 (i.e. 6000 lbs — mtcars wt is in 1000-lb units,
# not tons; this is beyond the observed range, so it is extrapolation)
NewCar <- data.frame(wt=6)
predict(Lm, newdata = NewCar)
1: 5.21829673100597
# Predict mpg when wt = 0.4 (i.e. 400 lbs, not 400 kg — mtcars wt is in
# 1000-lb units; also extrapolation below the observed range)
NewCar <- data.frame(wt=0.4)
predict(Lm, newdata = NewCar)
1: 35.147337538253
diamonds 데이터로 캐럿에 따른 가격 예측하기
library(ggplot2)
Warning message:
"package 'ggplot2' was built under R version 4.0.2"
df <- diamonds
head(df, 3)
carat | cut | color | clarity | depth | table | price | x | y | z |
---|---|---|---|---|---|---|---|---|---|
<dbl> | <ord> | <ord> | <ord> | <dbl> | <dbl> | <int> | <dbl> | <dbl> | <dbl> |
0.23 | Ideal | E | SI2 | 61.5 | 55 | 326 | 3.95 | 3.98 | 2.43 |
0.21 | Premium | E | SI1 | 59.8 | 61 | 326 | 3.89 | 3.84 | 2.31 |
0.23 | Good | E | VS1 | 56.9 | 65 | 327 | 4.05 | 4.07 | 2.31 |
str(df)
tibble [53,940 x 10] (S3: tbl_df/tbl/data.frame)
$ carat : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
$ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
$ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
$ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
$ depth : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
$ table : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
$ price : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
$ x : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
$ y : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
$ z : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Fit the regression model: price as a function of carat
Lm <- lm(price~carat, data = df)
Lm # each additional carat adds ~$7756 to the predicted price
Call:
lm(formula = price ~ carat, data = df)
Coefficients:
(Intercept) carat
-2256 7756
# Validate the model
# Multiple R-squared : 0.8493
# p-value : < 0.00000000000000022 — probability of data this extreme if the
# true slope were 0; far below 0.05, so carat is highly significant
options(scipen = 999)  # suppress scientific notation in the printed summary
summary(Lm)
Call:
lm(formula = price ~ carat, data = df)
Residuals:
Min 1Q Median 3Q Max
-18585.3 -804.8 -18.9 537.4 12731.7
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2256.36 13.06 -172.8 <0.0000000000000002 ***
carat 7756.43 14.07 551.4 <0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1549 on 53938 degrees of freedom
Multiple R-squared: 0.8493, Adjusted R-squared: 0.8493
F-statistic: 3.041e+05 on 1 and 53938 DF, p-value: < 0.00000000000000022
# Predict the price for 10-carat AND 20-carat diamonds (two rows of newdata)
NewDiamond <- data.frame(carat=c(10, 20))
predict(Lm, newdata = NewDiamond)
- 1
- 75307.895599639
- 2
- 152872.151779323
예측값 신뢰구간으로 나타내기
-
추정값을 x가 2일때 30이다 => x가 2일때 28~32이다
-
통계학은 항상 점추정보다 구간추정을 더 신뢰
-
예측구간 : 예측값의 신뢰구간
데이터의 신뢰구간을 그래프로 나타내기
library(ggplot2)
df <- diamonds
# Draw a random sample of 100 rows (index df itself, consistent with nrow(df);
# the original indexed `diamonds` directly, which only worked because df == diamonds)
# NOTE(review): no set.seed() call, so the sample — and all output below —
# changes on every run; add set.seed(<n>) first for reproducibility
Sample <- df[sample(nrow(df), 100), ]
head(Sample)
carat | cut | color | clarity | depth | table | price | x | y | z |
---|---|---|---|---|---|---|---|---|---|
<dbl> | <ord> | <ord> | <ord> | <dbl> | <dbl> | <int> | <dbl> | <dbl> | <dbl> |
1.14 | Very Good | H | SI2 | 63.3 | 57 | 4615 | 6.60 | 6.63 | 4.19 |
1.23 | Premium | E | SI1 | 59.2 | 62 | 7274 | 7.05 | 7.00 | 4.16 |
0.72 | Ideal | H | VVS1 | 60.8 | 58 | 3180 | 5.77 | 5.80 | 3.52 |
1.11 | Ideal | H | VS2 | 64.0 | 58 | 5632 | 6.52 | 6.46 | 4.16 |
0.30 | Good | H | VS1 | 63.4 | 53 | 526 | 4.25 | 4.30 | 2.71 |
0.30 | Ideal | G | VVS2 | 62.4 | 57 | 665 | 4.27 | 4.29 | 2.67 |
# Visualization
# g1: colour mapped inside ggplot(), so every later layer inherits the grouping
g1 <- ggplot(Sample, aes(x = carat, y=price, colour=color)) +
geom_point()
g1
# g2: colour mapped only inside geom_point(), so later layers are NOT grouped
g2 <- ggplot(Sample, aes(x=carat, y=price)) +
geom_point(aes(colour=color))
g2
colour 속성으로 그룹별 회귀모델을 나타낼 수 있음
-
ggplot() 넣으면 그룹별
-
geom_point() 하나만 표시
# g1: colour is in ggplot(), so geom_smooth draws one regression line per color group
g1 + geom_smooth(method = 'lm')
# g2: colour is only in geom_point(), so geom_smooth draws a single overall line
g2 + geom_smooth(method = 'lm')
# Fit the model on the sample — used to get exact interval values numerically
Lm <- lm(price~carat, data=Sample)
Lm
Call:
lm(formula = price ~ carat, data = Sample)
Coefficients:
(Intercept) carat
-1960 7167
# interval = 'confidence' : CI for the mean response;
# interval = 'prediction' : wider interval for a single new observation
p <- predict(Lm, interval = 'confidence', level = 0.95)
head(p)
fit | lwr | upr | |
---|---|---|---|
1 | 6210.3498 | 5953.1074 | 6467.592 |
2 | 6855.3964 | 6575.1028 | 7135.690 |
3 | 3200.1325 | 2979.7940 | 3420.471 |
4 | 5995.3343 | 5744.8967 | 6245.772 |
5 | 189.9152 | -121.7496 | 501.580 |
6 | 189.9152 | -121.7496 | 501.580 |
다중선형 회귀분석에서 변수 선택법
전진선택법(forward selection)
-
기준 통계치를 가장 많이 개선시키는 변수를 차례로 추가하는 방법
후진제거법(backward elimination)
-
기준 통계치에 가장 도움이 되지 않는 변수를 하나씩 제거하는 방법
단계적 방법(stepwise selection)
-
모든 변수가 포함된 모델에서 기준 통계치에 도움되지 않는 변수를 삭제
-
모델에 빠진 변수중 통계치를 개선시키는 변수를 추가하는 방법
-
변수의 추가 또는 삭제를 반복
회귀모델에서 변수를 선택하는 판단 기준
-
AIC(아카이케 통계량) => R step()
-
BIC(슈바르츠 통계량) => leaps 패키지의 regsubsets()
-
Cp(멜로우스 통계량)
-
세 통계량이 작을 수록 좋다.
다중공선성(Multicollinearity)
-
모형의 일부 예측변수가 다른 예측변수와 상관되어 있을 때 발생하는 조건
-
중대한 다중공선성은 회귀계수의 분산을 증가시켜 불안정하고 해석하기 어렵게 만들기 때문에 문제
-
R에서는 vif 함수를 이용해 VIF값을 구할 수 있으며, 보통 VIF값이 4가 넘으면 다중공선성이 존재한다고 봄
R에서 확인 하기
단계적 방법(stepwise selection)
# Full model: mpg regressed on all other mtcars variables (starting point for step)
df <- mtcars
Lm <- lm(mpg~., data=df)
Lm
Call:
lm(formula = mpg ~ ., data = df)
Coefficients:
(Intercept) cyl disp hp drat wt
12.30337 -0.11144 0.01334 -0.02148 0.78711 -3.71530
qsec vs am gear carb
0.82104 0.31776 2.52023 0.65541 -0.19942
step()
# step() - choose the model with the smallest AIC; direction : 'both', 'backward', 'forward'
# (original comment misspelled it as 'backword')
# at each step, rows marked '+' are previously dropped variables considered for re-entry
df_step <- step(Lm, direction = 'both')
Start: AIC=70.9
mpg ~ cyl + disp + hp + drat + wt + qsec + vs + am + gear + carb
Df Sum of Sq RSS AIC
- cyl 1 0.0799 147.57 68.915
- vs 1 0.1601 147.66 68.932
- carb 1 0.4067 147.90 68.986
- gear 1 1.3531 148.85 69.190
- drat 1 1.6270 149.12 69.249
- disp 1 3.9167 151.41 69.736
- hp 1 6.8399 154.33 70.348
- qsec 1 8.8641 156.36 70.765
<none> 147.49 70.898
- am 1 10.5467 158.04 71.108
- wt 1 27.0144 174.51 74.280
Step: AIC=68.92
mpg ~ disp + hp + drat + wt + qsec + vs + am + gear + carb
Df Sum of Sq RSS AIC
- vs 1 0.2685 147.84 66.973
- carb 1 0.5201 148.09 67.028
- gear 1 1.8211 149.40 67.308
- drat 1 1.9826 149.56 67.342
- disp 1 3.9009 151.47 67.750
- hp 1 7.3632 154.94 68.473
<none> 147.57 68.915
- qsec 1 10.0933 157.67 69.032
- am 1 11.8359 159.41 69.384
+ cyl 1 0.0799 147.49 70.898
- wt 1 27.0280 174.60 72.297
Step: AIC=66.97
mpg ~ disp + hp + drat + wt + qsec + am + gear + carb
Df Sum of Sq RSS AIC
- carb 1 0.6855 148.53 65.121
- gear 1 2.1437 149.99 65.434
- drat 1 2.2139 150.06 65.449
- disp 1 3.6467 151.49 65.753
- hp 1 7.1060 154.95 66.475
<none> 147.84 66.973
- am 1 11.5694 159.41 67.384
- qsec 1 15.6830 163.53 68.200
+ vs 1 0.2685 147.57 68.915
+ cyl 1 0.1883 147.66 68.932
- wt 1 27.3799 175.22 70.410
Step: AIC=65.12
mpg ~ disp + hp + drat + wt + qsec + am + gear
Df Sum of Sq RSS AIC
- gear 1 1.565 150.09 63.457
- drat 1 1.932 150.46 63.535
<none> 148.53 65.121
- disp 1 10.110 158.64 65.229
- am 1 12.323 160.85 65.672
- hp 1 14.826 163.35 66.166
+ carb 1 0.685 147.84 66.973
+ vs 1 0.434 148.09 67.028
+ cyl 1 0.414 148.11 67.032
- qsec 1 26.408 174.94 68.358
- wt 1 69.127 217.66 75.350
Step: AIC=63.46
mpg ~ disp + hp + drat + wt + qsec + am
Df Sum of Sq RSS AIC
- drat 1 3.345 153.44 62.162
- disp 1 8.545 158.64 63.229
<none> 150.09 63.457
- hp 1 13.285 163.38 64.171
+ gear 1 1.565 148.53 65.121
+ cyl 1 1.003 149.09 65.242
+ vs 1 0.645 149.45 65.319
+ carb 1 0.107 149.99 65.434
- am 1 20.036 170.13 65.466
- qsec 1 25.574 175.67 66.491
- wt 1 67.572 217.66 73.351
Step: AIC=62.16
mpg ~ disp + hp + wt + qsec + am
Df Sum of Sq RSS AIC
- disp 1 6.629 160.07 61.515
<none> 153.44 62.162
- hp 1 12.572 166.01 62.682
+ drat 1 3.345 150.09 63.457
+ gear 1 2.977 150.46 63.535
+ cyl 1 2.447 150.99 63.648
+ vs 1 1.121 152.32 63.927
+ carb 1 0.011 153.43 64.160
- qsec 1 26.470 179.91 65.255
- am 1 32.198 185.63 66.258
- wt 1 69.043 222.48 72.051
Step: AIC=61.52
mpg ~ hp + wt + qsec + am
Df Sum of Sq RSS AIC
- hp 1 9.219 169.29 61.307
<none> 160.07 61.515
+ disp 1 6.629 153.44 62.162
+ carb 1 3.227 156.84 62.864
+ drat 1 1.428 158.64 63.229
- qsec 1 20.225 180.29 63.323
+ cyl 1 0.249 159.82 63.465
+ vs 1 0.249 159.82 63.466
+ gear 1 0.171 159.90 63.481
- am 1 25.993 186.06 64.331
- wt 1 78.494 238.56 72.284
Step: AIC=61.31
mpg ~ wt + qsec + am
Df Sum of Sq RSS AIC
<none> 169.29 61.307
+ hp 1 9.219 160.07 61.515
+ carb 1 8.036 161.25 61.751
+ disp 1 3.276 166.01 62.682
+ cyl 1 1.501 167.78 63.022
+ drat 1 1.400 167.89 63.042
+ gear 1 0.123 169.16 63.284
+ vs 1 0.000 169.29 63.307
- am 1 26.178 195.46 63.908
- qsec 1 109.034 278.32 75.217
- wt 1 183.347 352.63 82.790
# Final model keeps wt (weight), qsec (quarter-mile time) and am (transmission type)
df_step
Call:
lm(formula = mpg ~ wt + qsec + am, data = df)
Coefficients:
(Intercept) wt qsec am
9.618 -3.917 1.226 2.936
leaps 패키지 사용 - regsubsets()
install.packages('leaps')
package 'leaps' successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\205\AppData\Local\Temp\Rtmpk3oBnd\downloaded_packages
library(leaps)
Warning message:
"package 'leaps' was built under R version 4.0.2"
df <- mtcars
# regsubsets() - best variable subset of each size by exhaustive search (compare with BIC)
df_leaps <- regsubsets(mpg~., data=df
, nvmax=10, method='exhaustive')
# Inspect the result: each row is the best subset of that size
# 1 variable  => wt
# 2 variables => cyl, wt
# 3 variables => wt, qsec (quarter-mile time), am
summary(df_leaps)
Subset selection object
Call: regsubsets.formula(mpg ~ ., data = df, nvmax = 10, method = "exhaustive")
10 Variables (and intercept)
Forced in Forced out
cyl FALSE FALSE
disp FALSE FALSE
hp FALSE FALSE
drat FALSE FALSE
wt FALSE FALSE
qsec FALSE FALSE
vs FALSE FALSE
am FALSE FALSE
gear FALSE FALSE
carb FALSE FALSE
1 subsets of each size up to 10
Selection Algorithm: exhaustive
cyl disp hp drat wt qsec vs am gear carb
1 ( 1 ) " " " " " " " " "*" " " " " " " " " " "
2 ( 1 ) "*" " " " " " " "*" " " " " " " " " " "
3 ( 1 ) " " " " " " " " "*" "*" " " "*" " " " "
4 ( 1 ) " " " " "*" " " "*" "*" " " "*" " " " "
5 ( 1 ) " " "*" "*" " " "*" "*" " " "*" " " " "
6 ( 1 ) " " "*" "*" "*" "*" "*" " " "*" " " " "
7 ( 1 ) " " "*" "*" "*" "*" "*" " " "*" "*" " "
8 ( 1 ) " " "*" "*" "*" "*" "*" " " "*" "*" "*"
9 ( 1 ) " " "*" "*" "*" "*" "*" "*" "*" "*" "*"
10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
# Choose the model size by smallest BIC — 3 variables, BIC = -46.7732
BestBic <- summary(df_leaps)$bic
BestBic
- -37.7946153185172
- -46.348243336331
- -46.7732016358928
- -45.099468000674
- -42.9871333658026
- -40.2266298608964
- -37.0962985327756
- -33.7785850465411
- -30.3710226982818
- -26.9226107507162
# Locate the best size graphically; highlight the minimum-BIC point in red
Min.val <- which.min(BestBic)
plot(BestBic, type='b')
points(Min.val, BestBic[Min.val], cex=5, col='red', pch=20)
# Coefficients of the best 3-variable model
coef(df_leaps, 3)
- (Intercept)
- 9.61778051456159
- wt
- -3.91650372494249
- qsec
- 1.2258859715837
- am
- 2.93583719188942
전진선택법
# regsubsets() - BIC, method='forward' (original comment wrongly said 'backward')
df_leaps <- regsubsets(mpg~., data=df, nvmax=10, method='forward')
# Inspect the result
# with forward selection, a variable once added stays in every larger model
summary(df_leaps)
Subset selection object
Call: regsubsets.formula(mpg ~ ., data = df, nvmax = 10, method = "forward")
10 Variables (and intercept)
Forced in Forced out
cyl FALSE FALSE
disp FALSE FALSE
hp FALSE FALSE
drat FALSE FALSE
wt FALSE FALSE
qsec FALSE FALSE
vs FALSE FALSE
am FALSE FALSE
gear FALSE FALSE
carb FALSE FALSE
1 subsets of each size up to 10
Selection Algorithm: forward
cyl disp hp drat wt qsec vs am gear carb
1 ( 1 ) " " " " " " " " "*" " " " " " " " " " "
2 ( 1 ) "*" " " " " " " "*" " " " " " " " " " "
3 ( 1 ) "*" " " "*" " " "*" " " " " " " " " " "
4 ( 1 ) "*" " " "*" " " "*" " " " " "*" " " " "
5 ( 1 ) "*" " " "*" " " "*" "*" " " "*" " " " "
6 ( 1 ) "*" "*" "*" " " "*" "*" " " "*" " " " "
7 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" " " " "
8 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " "
9 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" "*"
10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
# Locate the best size by smallest BIC and highlight it in red
# BUG FIX: the original highlighted BestBic[Min.val] — the BIC vector from the
# earlier exhaustive-search run — instead of ForwardBic computed here
ForwardBic <- summary(df_leaps)$bic
Min.val <- which.min(ForwardBic)
plot(ForwardBic, type='b')
points(Min.val, ForwardBic[Min.val], cex=5, col='red', pch=20)
# Coefficients of the best 2-variable model
coef(df_leaps, 2)
- (Intercept)
- 39.686261480253
- cyl
- -1.5077949682598
- wt
- -3.19097213898375
후진제거법
# regsubsets() - BIC, method='backward'
df_leaps <- regsubsets(mpg~., data=df, nvmax=10, method='backward')
# Inspect the result
# read the table from the bottom (full model) upward as variables are removed
summary(df_leaps)
Subset selection object
Call: regsubsets.formula(mpg ~ ., data = df, nvmax = 10, method = "backward")
10 Variables (and intercept)
Forced in Forced out
cyl FALSE FALSE
disp FALSE FALSE
hp FALSE FALSE
drat FALSE FALSE
wt FALSE FALSE
qsec FALSE FALSE
vs FALSE FALSE
am FALSE FALSE
gear FALSE FALSE
carb FALSE FALSE
1 subsets of each size up to 10
Selection Algorithm: backward
cyl disp hp drat wt qsec vs am gear carb
1 ( 1 ) " " " " " " " " "*" " " " " " " " " " "
2 ( 1 ) " " " " " " " " "*" "*" " " " " " " " "
3 ( 1 ) " " " " " " " " "*" "*" " " "*" " " " "
4 ( 1 ) " " " " "*" " " "*" "*" " " "*" " " " "
5 ( 1 ) " " "*" "*" " " "*" "*" " " "*" " " " "
6 ( 1 ) " " "*" "*" "*" "*" "*" " " "*" " " " "
7 ( 1 ) " " "*" "*" "*" "*" "*" " " "*" "*" " "
8 ( 1 ) " " "*" "*" "*" "*" "*" " " "*" "*" "*"
9 ( 1 ) " " "*" "*" "*" "*" "*" "*" "*" "*" "*"
10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
# Locate the best size by smallest BIC and highlight it in red
BackwardBic <- summary(df_leaps)$bic
Min.val <- which.min(BackwardBic)
plot(BackwardBic, type='b')
points(Min.val, BackwardBic[Min.val], cex=5, col='red', pch=20)
# Coefficients of the best 3-variable model
coef(df_leaps, 3)
- (Intercept)
- 9.61778051456161
- wt
- -3.91650372494249
- qsec
- 1.2258859715837
- am
- 2.93583719188942
명목형 변수 해석하기
-
문자형 -> 숫자형
-
없다 0, 있다1 바꾸어서 의미를 파악
-
남자 0, 여자 1 y = 2 + 4x => 남자라면 2, 여자면 6이됨
head(mtcars)
mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb | |
---|---|---|---|---|---|---|---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
Mazda RX4 | 21.0 | 6 | 160 | 110 | 3.90 | 2.620 | 16.46 | 0 | 1 | 4 | 4 |
Mazda RX4 Wag | 21.0 | 6 | 160 | 110 | 3.90 | 2.875 | 17.02 | 0 | 1 | 4 | 4 |
Datsun 710 | 22.8 | 4 | 108 | 93 | 3.85 | 2.320 | 18.61 | 1 | 1 | 4 | 1 |
Hornet 4 Drive | 21.4 | 6 | 258 | 110 | 3.08 | 3.215 | 19.44 | 1 | 0 | 3 | 1 |
Hornet Sportabout | 18.7 | 8 | 360 | 175 | 3.15 | 3.440 | 17.02 | 0 | 0 | 3 | 2 |
Valiant | 18.1 | 6 | 225 | 105 | 2.76 | 3.460 | 20.22 | 1 | 0 | 3 | 1 |
# am: 0 = automatic, 1 = manual => manual cars average 7.245 mpg higher
lm(mpg ~ am, data = mtcars)
Call:
lm(formula = mpg ~ am, data = mtcars)
Coefficients:
(Intercept) am
17.147 7.245
다중 명목형 변수
install.packages('car')
also installing the dependencies 'matrixStats', 'RcppArmadillo', 'zip', 'SparseM', 'MatrixModels', 'conquer', 'sp', 'openxlsx', 'minqa', 'nloptr', 'statmod', 'RcppEigen', 'carData', 'abind', 'pbkrtest', 'quantreg', 'maptools', 'rio', 'lme4'
package 'matrixStats' successfully unpacked and MD5 sums checked
package 'RcppArmadillo' successfully unpacked and MD5 sums checked
package 'zip' successfully unpacked and MD5 sums checked
package 'SparseM' successfully unpacked and MD5 sums checked
package 'MatrixModels' successfully unpacked and MD5 sums checked
package 'conquer' successfully unpacked and MD5 sums checked
package 'sp' successfully unpacked and MD5 sums checked
package 'openxlsx' successfully unpacked and MD5 sums checked
package 'minqa' successfully unpacked and MD5 sums checked
package 'nloptr' successfully unpacked and MD5 sums checked
package 'statmod' successfully unpacked and MD5 sums checked
package 'RcppEigen' successfully unpacked and MD5 sums checked
package 'carData' successfully unpacked and MD5 sums checked
package 'abind' successfully unpacked and MD5 sums checked
package 'pbkrtest' successfully unpacked and MD5 sums checked
package 'quantreg' successfully unpacked and MD5 sums checked
package 'maptools' successfully unpacked and MD5 sums checked
package 'rio' successfully unpacked and MD5 sums checked
package 'lme4' successfully unpacked and MD5 sums checked
package 'car' successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\205\AppData\Local\Temp\Rtmpk3oBnd\downloaded_packages
library(car)
Warning message:
"package 'car' was built under R version 4.0.2"
Loading required package: carData
head(Vocab, 3)
year | sex | education | vocabulary | |
---|---|---|---|---|
<dbl> | <fct> | <dbl> | <dbl> | |
19740001 | 1974 | Male | 14 | 9 |
19740002 | 1974 | Male | 16 | 9 |
19740003 | 1974 | Female | 10 | 9 |
str(Vocab)
'data.frame': 30351 obs. of 4 variables:
$ year : num 1974 1974 1974 1974 1974 ...
$ sex : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 2 2 2 1 1 ...
$ education : num 14 16 10 10 12 16 17 10 12 11 ...
$ vocabulary: num 9 9 9 5 8 8 9 5 3 5 ...
- attr(*, "na.action")= 'omit' Named int [1:32115] 1 2 3 4 5 6 7 8 9 10 ...
..- attr(*, "names")= chr [1:32115] "19720001" "19720002" "19720003" "19720004" ...
# year: survey year, sex: respondent sex, education: years of education,
# vocabulary: vocabulary test score (0-10) — see ?Vocab for details
summary(Vocab)
year sex education vocabulary
Min. :1974 Female:17148 Min. : 0.00 Min. : 0.000
1st Qu.:1987 Male :13203 1st Qu.:12.00 1st Qu.: 5.000
Median :1994 Median :12.00 Median : 6.000
Mean :1995 Mean :13.03 Mean : 6.004
3rd Qu.:2006 3rd Qu.:15.00 3rd Qu.: 7.000
Max. :2016 Max. :20.00 Max. :10.000
?Vocab
# Fit vocabulary score on sex and education
# FIX: use bare column names with data = Vocab; the original wrote
# Vocab$vocabulary ~ Vocab$sex + Vocab$education, which makes data = Vocab
# redundant and produces ugly coefficient names like "Vocab$sexMale".
# The fitted model is numerically identical.
Lm <- lm(vocabulary ~ sex + education, data = Vocab)
Lm
Call:
lm(formula = Vocab$vocabulary ~ Vocab$sex + Vocab$education,
data = Vocab)
Coefficients:
(Intercept) Vocab$sexMale Vocab$education
1.7366 -0.1687 0.3330
sexMale 계수 -0.1687 : 남자이면 점수가 0.1687점 낮아짐
education 계수 0.333 : 교육 연수 1년당 점수가 0.333점 올라감