ABOUT ME

-

Today
-
Yesterday
-
Total
-
  • 추정치 구하기
    R 2020. 7. 14. 17:31

    추정치 구하기

    • predict() 사용

    야구 데이터 추정하기

    • 홈런(HR)에 대한 루타(TB) 회귀분석 하기

    # 데이터 로드
    df_kbo <- read.csv('r-ggagi-data/example_kbo2015.csv')
    head(df_kbo, 3)
     
      ranking team AVG G PA AB R H X2B X3B IBB HBP SO GDP SLG OBP OPS MH RISP PH.BA
      <int> <chr> <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
    1 1 삼성 0.300 102 4091 3553 634 1066 188 19 15 47 650 78 0.471 0.374 0.845 101 0.300 0.216
    2 2 넥센 0.300 102 4151 3620 658 1085 227 12 13 61 815 79 0.498 0.374 0.872 100 0.294 0.268
    3 3 두산 0.291 99 3950 3410 570 991 176 16 17 66 544 95 0.438 0.368 0.806 98 0.284 0.262
    # 회귀모델 구하기
    Lm <- lm(HR~TB, data = df_kbo)
    Lm # y = -109.2696 + 0.1441x (1.44개의 홈런 / 10개의 안타)
    Call:
    lm(formula = HR ~ TB, data = df_kbo)
    
    Coefficients:
    (Intercept)           TB  
      -109.2696       0.1441  
    # 회귀모델 검증
    # Multiple R-squared :  0.9039
    # p-value : 2.423e-05
    # Pr(>|t|) 2.42e-05
    summary(Lm)
    Call:
    lm(formula = HR ~ TB, data = df_kbo)
    
    Residuals:
       Min     1Q Median     3Q    Max 
    -7.991 -4.702 -3.498  4.471 11.884 
    
    Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
    (Intercept) -109.26964   24.92619  -4.384  0.00234 ** 
    TB             0.14411    0.01661   8.677 2.42e-05 ***
    ---
    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    
    Residual standard error: 7.963 on 8 degrees of freedom
    Multiple R-squared:  0.9039,    Adjusted R-squared:  0.8919 
    F-statistic: 75.28 on 1 and 8 DF,  p-value: 2.423e-05
    # 회귀모델에 값을 넣어 추정치 구하기
    p <- predict(Lm)
    p
    1
    131.825268238313
    2
    150.2712623156
    3
    105.885589067127
    4
    117.990772680347
    5
    118.278991337805
    6
    88.3042509622124
    7
    85.7102830450938
    8
    91.3305468655174
    9
    86.286720360009
    10
    83.1163151279752
    # 두 값 비교하기
    c(df_kbo$HR[1], p[1])
    c(df_kbo$HR[2], p[2])
    1
    127
    1
    131.825268238313
    1
    155
    2
    150.2712623156
    # 모든 값 비교하기
    Com <- data.frame(team = df_kbo$team, HR = df_kbo$HR, fittedHR = round(p), fittedHR = p)
    Com
     
      team HR fittedHR fittedHR.1
      <chr> <int> <dbl> <dbl>
    1 삼성 127 132 131.82527
    2 넥센 155 150 150.27126
    3 두산 98 106 105.88559
    4 NC 110 118 117.99077
    5 롯데 130 118 118.27899
    6 SK 92 88 88.30425
    7 한화 82 86 85.71028
    8 kt 87 91 91.33055
    9 LG 83 86 86.28672
    10 KIA 95 83 83.11632

    루타가 1600일 때, 홈런의 갯수는?

    # 같은 변수명 data.frame 으로 만들어야 함
    newTeam <- data.frame(TB = 1600)
    predict(Lm, newTeam)

    1: 121.30528724111

    # 삼성
    df_kbo[1,]$TB

    1673


    mtcars 데이터로 예측하기

    • 연비(mpg) 종속변수(y), 차체 무게를 독립변수(x)로 회귀모델 구하기

    df <- mtcars
    head(df, 3)
     
      mpg cyl disp hp drat wt qsec vs am gear carb
      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
    Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
    Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
    Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
    str(df)
    'data.frame':    32 obs. of  11 variables:
     $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
     $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
     $ disp: num  160 160 108 258 360 ...
     $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
     $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
     $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
     $ qsec: num  16.5 17 18.6 19.4 17 ...
     $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
     $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
     $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
     $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
    # 회귀모델 구하기
    Lm <- lm(mpg~wt, data=df)
    Lm # 1ton 늘때마다 연비 5만큼 낮아짐
    Call:
    lm(formula = mpg ~ wt, data = df)
    
    Coefficients:
    (Intercept)           wt  
         37.285       -5.344  
    # 회귀모델 검증
    # R-squared :  0.7528
    # p-value : 0.0000000001294 (x값이 0이 될 확률)
    options(scipen = 999)
    summary(Lm)
    Call:
    lm(formula = mpg ~ wt, data = df)
    
    Residuals:
        Min      1Q  Median      3Q     Max 
    -4.5432 -2.3647 -0.1252  1.4096  6.8727 
    
    Coefficients:
                Estimate Std. Error t value             Pr(>|t|)    
    (Intercept)  37.2851     1.8776  19.858 < 0.0000000000000002 ***
    wt           -5.3445     0.5591  -9.559       0.000000000129 ***
    ---
    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    
    Residual standard error: 3.046 on 30 degrees of freedom
    Multiple R-squared:  0.7528,    Adjusted R-squared:  0.7446 
    F-statistic: 91.38 on 1 and 30 DF,  p-value: 0.0000000001294
    # 추정치 구하기
    p <- predict(Lm)
    p
    Mazda RX4
    23.2826106468086
    Mazda RX4 Wag
    21.9197703957643
    Datsun 710
    24.8859521186254
    Hornet 4 Drive
    20.1026500610386
    Hornet Sportabout
    18.900143957176
    Valiant
    18.7932545257216
    Duster 360
    18.2053626527221
    Merc 240D
    20.2362618503567
    Merc 230
    20.4500407132656
    Merc 280
    18.900143957176
    Merc 280C
    18.900143957176
    Merc 450SE
    15.5331268663607
    Merc 450SL
    17.3502472010864
    Merc 450SLC
    17.0830236224503
    Cadillac Fleetwood
    9.22665041054798
    Lincoln Continental
    8.29671235689423
    Chrysler Imperial
    8.71892561113932
    Fiat 128
    25.5272887073521
    Honda Civic
    28.6538045773949
    Toyota Corolla
    27.4780208313959
    Toyota Corona
    24.1110037405806
    Dodge Challenger
    18.4725862313582
    AMC Javelin
    18.9268663150396
    Camaro Z28
    16.762355328087
    Pontiac Firebird
    16.7356329702233
    Fiat X1-9
    26.9435736741236
    Porsche 914-2
    25.8479570017155
    Lotus Europa
    29.1989406778126
    Ford Pantera L
    20.3431512818111
    Ferrari Dino
    22.4809399109002
    Maserati Bora
    18.2053626527221
    Volvo 142E
    22.427495195173
    # 두 값 비교
    data.frame(mpg=df$mpg, fittedMPG=p)
     
      mpg fittedMPG
      <dbl> <dbl>
    Mazda RX4 21.0 23.282611
    Mazda RX4 Wag 21.0 21.919770
    Datsun 710 22.8 24.885952
    Hornet 4 Drive 21.4 20.102650
    Hornet Sportabout 18.7 18.900144
    Valiant 18.1 18.793255
    Duster 360 14.3 18.205363
    Merc 240D 24.4 20.236262
    Merc 230 22.8 20.450041
    Merc 280 19.2 18.900144
    Merc 280C 17.8 18.900144
    Merc 450SE 16.4 15.533127
    Merc 450SL 17.3 17.350247
    Merc 450SLC 15.2 17.083024
    Cadillac Fleetwood 10.4 9.226650
    Lincoln Continental 10.4 8.296712
    Chrysler Imperial 14.7 8.718926
    Fiat 128 32.4 25.527289
    Honda Civic 30.4 28.653805
    Toyota Corolla 33.9 27.478021
    Toyota Corona 21.5 24.111004
    Dodge Challenger 15.5 18.472586
    AMC Javelin 15.2 18.926866
    Camaro Z28 13.3 16.762355
    Pontiac Firebird 19.2 16.735633
    Fiat X1-9 27.3 26.943574
    Porsche 914-2 26.0 25.847957
    Lotus Europa 30.4 29.198941
    Ford Pantera L 15.8 20.343151
    Ferrari Dino 19.7 22.480940
    Maserati Bora 15.0 18.205363
    Volvo 142E 21.4 22.427495
    # 차가 6톤일 때 연비 추정
    NewCar  <- data.frame(wt=6)
    predict(Lm, newdata = NewCar)

    1: 5.21829673100597

    # 차가 400Kg일 때 연비 추정
    NewCar  <- data.frame(wt=0.4)
    predict(Lm, newdata = NewCar)

    1: 35.147337538253


    diamonds 데이터로 캐럿에 따른 가격 예측하기

    library(ggplot2)
    Warning message:
    "package 'ggplot2' was built under R version 4.0.2"
    df <- diamonds
    head(df, 3)
     
    carat cut color clarity depth table price x y z
    <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
    0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
    0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
    0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
    str(df)
    tibble [53,940 x 10] (S3: tbl_df/tbl/data.frame)
     $ carat  : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
     $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
     $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
     $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
     $ depth  : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
     $ table  : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
     $ price  : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
     $ x      : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
     $ y      : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
     $ z      : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
    # 회귀모델 구하기 - 캐럿에 따른 가격
    Lm <- lm(price~carat, data = df)
    Lm # 1캐럿이 올라가면 7756달러 올라감
    Call:
    lm(formula = price ~ carat, data = df)
    
    Coefficients:
    (Intercept)        carat  
          -2256         7756  
    # 회귀 모델 검증
    # Multiple R-squared :  0.8493
    # p-value : < 0.00000000000000022
    # Pr(>|t|) 0.00000000000000022 < 0.05 (0 이될 확률 낮음)
    options(scipen = 999)
    summary(Lm)
    Call:
    lm(formula = price ~ carat, data = df)
    
    Residuals:
         Min       1Q   Median       3Q      Max 
    -18585.3   -804.8    -18.9    537.4  12731.7 
    
    Coefficients:
                Estimate Std. Error t value            Pr(>|t|)    
    (Intercept) -2256.36      13.06  -172.8 <0.0000000000000002 ***
    carat        7756.43      14.07   551.4 <0.0000000000000002 ***
    ---
    Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    
    Residual standard error: 1549 on 53938 degrees of freedom
    Multiple R-squared:  0.8493,    Adjusted R-squared:  0.8493 
    F-statistic: 3.041e+05 on 1 and 53938 DF,  p-value: < 0.00000000000000022
    # 10 캐럿 일 때, 가격 예측
    NewDiamond <- data.frame(carat=c(10, 20))
    predict(Lm, newdata = NewDiamond)
    1
    75307.895599639
    2
    152872.151779323

    예측값 신뢰구간으로 나타내기

    • 추정값을 x가 2일때 30이다 => x가 2일때 28~32이다

    • 통계학은 항상 점추정보다 구간추정을 더 신뢰

    • 예측구간 : 예측값의 신뢰구간

    데이터의 신뢰구간을 그래프로 나타내기

    library(ggplot2)
    df <- diamonds
    # 표본 추출
    Sample <- diamonds[sample(nrow(df), 100), ]
    head(Sample)
     
    carat cut color clarity depth table price x y z
    <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
    1.14 Very Good H SI2 63.3 57 4615 6.60 6.63 4.19
    1.23 Premium E SI1 59.2 62 7274 7.05 7.00 4.16
    0.72 Ideal H VVS1 60.8 58 3180 5.77 5.80 3.52
    1.11 Ideal H VS2 64.0 58 5632 6.52 6.46 4.16
    0.30 Good H VS1 63.4 53 526 4.25 4.30 2.71
    0.30 Ideal G VVS2 62.4 57 665 4.27 4.29 2.67
    # 시각화
    g1 <- ggplot(Sample, aes(x = carat, y=price, colour=color)) +
    geom_point()
    g1

    g2 <- ggplot(Sample, aes(x=carat, y=price)) +
    geom_point(aes(colour=color))
    g2

    colour 속성으로 그룹별 회귀모델을 나타낼 수 있음

    • ggplot() 넣으면 그룹별

    • geom_point() 하나만 표시

    # color 그룹별 회귀모델
    g1 + geom_smooth(method = 'lm')

    # color 그룹별 회귀모델
    g2 + geom_smooth(method = 'lm')

    # 회귀모델 구하기 - 정확한 값으로 예측구간 나타내기
    Lm <- lm(price~carat, data=Sample)
    Lm
    Call:
    lm(formula = price ~ carat, data = Sample)
    
    Coefficients:
    (Intercept)        carat  
          -1960         7167  
    # confidence : 평균신뢰구간, prediction 예측구간
    p <- predict(Lm, interval = 'confidence', level = 0.95)
    head(p)
     
      fit lwr upr
    1 6210.3498 5953.1074 6467.592
    2 6855.3964 6575.1028 7135.690
    3 3200.1325 2979.7940 3420.471
    4 5995.3343 5744.8967 6245.772
    5 189.9152 -121.7496 501.580
    6 189.9152 -121.7496 501.580

    다중선형 회귀분석에서 변수 선택법

    전진선택법(forward selection)

    • 기준 통계치를 가장 많이 개선시키는 변수를 차례로 추가하는 방법

    후진제거법(backward elimination)

    • 기준 통계치에 가장 도움이 되지 않는 변수를 하나씩 제거하는 방법

    단계적 방법(stepwise selection)

    • 모든 변수가 포함된 모델에서 기준 통계치에 도움되지 않는 변수를 삭제

    • 모델에 빠진 변수중 통계치를 개선시키는 변수를 추가하는 방법

    • 변수의 추가 또는 삭제를 반복

    회귀모델에서 변수를 선택하는 판단 기준

    • AIC(아케이케 통계량) => R step()

    • BIC(슈바르츠 통계량) => leaps 패키지의 redsubset()

    • Cp(멜로우스 통계량)

    • 세 통계량이 작을 수록 좋다.

    다중공선성(Multicolinearity)

    • 모형의 일부 예측변수가 다른 예측변수와 상관되어 있을 때 발생하는 조건

    • 중대한 다중공선성은 회귀계수의 분산을 증가시켜 불안정하고 해석하기 어렵게 만들기 때문에 문제

    • R에서는 vif 함수를 이용해 VIF값을 구할 수 있으며, 보통 VIF값이 4가 넘으면 다중공선성이 존재한다고 봄

    R에서 확인 하기

    단계적 방법(stepwise selection)

    df <- mtcars
    Lm <- lm(mpg~., data=df)
    Lm
    Call:
    lm(formula = mpg ~ ., data = df)
    
    Coefficients:
    (Intercept)          cyl         disp           hp         drat           wt  
       12.30337     -0.11144      0.01334     -0.02148      0.78711     -3.71530  
           qsec           vs           am         gear         carb  
        0.82104      0.31776      2.52023      0.65541     -0.19942  

    step()

    # step() - AIC 값 확인(작은 값) direction : 'both', 'backword', 'forward'
    # 각 단계의 + 는 탈락된 변수
    df_step <- step(Lm, direction = 'both')
    Start:  AIC=70.9
    mpg ~ cyl + disp + hp + drat + wt + qsec + vs + am + gear + carb
    
           Df Sum of Sq    RSS    AIC
    - cyl   1    0.0799 147.57 68.915
    - vs    1    0.1601 147.66 68.932
    - carb  1    0.4067 147.90 68.986
    - gear  1    1.3531 148.85 69.190
    - drat  1    1.6270 149.12 69.249
    - disp  1    3.9167 151.41 69.736
    - hp    1    6.8399 154.33 70.348
    - qsec  1    8.8641 156.36 70.765
    <none>              147.49 70.898
    - am    1   10.5467 158.04 71.108
    - wt    1   27.0144 174.51 74.280
    
    Step:  AIC=68.92
    mpg ~ disp + hp + drat + wt + qsec + vs + am + gear + carb
    
           Df Sum of Sq    RSS    AIC
    - vs    1    0.2685 147.84 66.973
    - carb  1    0.5201 148.09 67.028
    - gear  1    1.8211 149.40 67.308
    - drat  1    1.9826 149.56 67.342
    - disp  1    3.9009 151.47 67.750
    - hp    1    7.3632 154.94 68.473
    <none>              147.57 68.915
    - qsec  1   10.0933 157.67 69.032
    - am    1   11.8359 159.41 69.384
    + cyl   1    0.0799 147.49 70.898
    - wt    1   27.0280 174.60 72.297
    
    Step:  AIC=66.97
    mpg ~ disp + hp + drat + wt + qsec + am + gear + carb
    
           Df Sum of Sq    RSS    AIC
    - carb  1    0.6855 148.53 65.121
    - gear  1    2.1437 149.99 65.434
    - drat  1    2.2139 150.06 65.449
    - disp  1    3.6467 151.49 65.753
    - hp    1    7.1060 154.95 66.475
    <none>              147.84 66.973
    - am    1   11.5694 159.41 67.384
    - qsec  1   15.6830 163.53 68.200
    + vs    1    0.2685 147.57 68.915
    + cyl   1    0.1883 147.66 68.932
    - wt    1   27.3799 175.22 70.410
    
    Step:  AIC=65.12
    mpg ~ disp + hp + drat + wt + qsec + am + gear
    
           Df Sum of Sq    RSS    AIC
    - gear  1     1.565 150.09 63.457
    - drat  1     1.932 150.46 63.535
    <none>              148.53 65.121
    - disp  1    10.110 158.64 65.229
    - am    1    12.323 160.85 65.672
    - hp    1    14.826 163.35 66.166
    + carb  1     0.685 147.84 66.973
    + vs    1     0.434 148.09 67.028
    + cyl   1     0.414 148.11 67.032
    - qsec  1    26.408 174.94 68.358
    - wt    1    69.127 217.66 75.350
    
    Step:  AIC=63.46
    mpg ~ disp + hp + drat + wt + qsec + am
    
           Df Sum of Sq    RSS    AIC
    - drat  1     3.345 153.44 62.162
    - disp  1     8.545 158.64 63.229
    <none>              150.09 63.457
    - hp    1    13.285 163.38 64.171
    + gear  1     1.565 148.53 65.121
    + cyl   1     1.003 149.09 65.242
    + vs    1     0.645 149.45 65.319
    + carb  1     0.107 149.99 65.434
    - am    1    20.036 170.13 65.466
    - qsec  1    25.574 175.67 66.491
    - wt    1    67.572 217.66 73.351
    
    Step:  AIC=62.16
    mpg ~ disp + hp + wt + qsec + am
    
           Df Sum of Sq    RSS    AIC
    - disp  1     6.629 160.07 61.515
    <none>              153.44 62.162
    - hp    1    12.572 166.01 62.682
    + drat  1     3.345 150.09 63.457
    + gear  1     2.977 150.46 63.535
    + cyl   1     2.447 150.99 63.648
    + vs    1     1.121 152.32 63.927
    + carb  1     0.011 153.43 64.160
    - qsec  1    26.470 179.91 65.255
    - am    1    32.198 185.63 66.258
    - wt    1    69.043 222.48 72.051
    
    Step:  AIC=61.52
    mpg ~ hp + wt + qsec + am
    
           Df Sum of Sq    RSS    AIC
    - hp    1     9.219 169.29 61.307
    <none>              160.07 61.515
    + disp  1     6.629 153.44 62.162
    + carb  1     3.227 156.84 62.864
    + drat  1     1.428 158.64 63.229
    - qsec  1    20.225 180.29 63.323
    + cyl   1     0.249 159.82 63.465
    + vs    1     0.249 159.82 63.466
    + gear  1     0.171 159.90 63.481
    - am    1    25.993 186.06 64.331
    - wt    1    78.494 238.56 72.284
    
    Step:  AIC=61.31
    mpg ~ wt + qsec + am
    
           Df Sum of Sq    RSS    AIC
    <none>              169.29 61.307
    + hp    1     9.219 160.07 61.515
    + carb  1     8.036 161.25 61.751
    + disp  1     3.276 166.01 62.682
    + cyl   1     1.501 167.78 63.022
    + drat  1     1.400 167.89 63.042
    + gear  1     0.123 169.16 63.284
    + vs    1     0.000 169.29 63.307
    - am    1    26.178 195.46 63.908
    - qsec  1   109.034 278.32 75.217
    - wt    1   183.347 352.63 82.790
    # 최종 회귀모델 확인 wt(차체무게), qesc(드래그 레이스 타임), am(미션종류)
    df_step
    Call:
    lm(formula = mpg ~ wt + qsec + am, data = df)
    
    Coefficients:
    (Intercept)           wt         qsec           am  
          9.618       -3.917        1.226        2.936  

    leaps 패키지 사용 - regsubsets()

    install.packages('leaps')
    package 'leaps' successfully unpacked and MD5 sums checked
    
    The downloaded binary packages are in
        C:\Users\205\AppData\Local\Temp\Rtmpk3oBnd\downloaded_packages
    library(leaps)
    Warning message:
    "package 'leaps' was built under R version 4.0.2"
    df <- mtcars
    # regsubsets() - BIC method='exhaustive'
    df_leaps <- regsubsets(mpg~., data=df
                           , nvmax=10, method='exhaustive')
    # 결과 확인
    # 각 변수 갯수별 변수를 뽑아줌
    # 1개의 변수만 쓴다면 => wt
    # 2개의 변수만 쓴다면 => cyl, wt
    # 3개의 변수만 쓴다면 => wt, qsec(드래그타임, 느린차), am
    summary(df_leaps)
    Subset selection object
    Call: regsubsets.formula(mpg ~ ., data = df, nvmax = 10, method = "exhaustive")
    10 Variables  (and intercept)
         Forced in Forced out
    cyl      FALSE      FALSE
    disp     FALSE      FALSE
    hp       FALSE      FALSE
    drat     FALSE      FALSE
    wt       FALSE      FALSE
    qsec     FALSE      FALSE
    vs       FALSE      FALSE
    am       FALSE      FALSE
    gear     FALSE      FALSE
    carb     FALSE      FALSE
    1 subsets of each size up to 10
    Selection Algorithm: exhaustive
              cyl disp hp  drat wt  qsec vs  am  gear carb
    1  ( 1 )  " " " "  " " " "  "*" " "  " " " " " "  " " 
    2  ( 1 )  "*" " "  " " " "  "*" " "  " " " " " "  " " 
    3  ( 1 )  " " " "  " " " "  "*" "*"  " " "*" " "  " " 
    4  ( 1 )  " " " "  "*" " "  "*" "*"  " " "*" " "  " " 
    5  ( 1 )  " " "*"  "*" " "  "*" "*"  " " "*" " "  " " 
    6  ( 1 )  " " "*"  "*" "*"  "*" "*"  " " "*" " "  " " 
    7  ( 1 )  " " "*"  "*" "*"  "*" "*"  " " "*" "*"  " " 
    8  ( 1 )  " " "*"  "*" "*"  "*" "*"  " " "*" "*"  "*" 
    9  ( 1 )  " " "*"  "*" "*"  "*" "*"  "*" "*" "*"  "*" 
    10  ( 1 ) "*" "*"  "*" "*"  "*" "*"  "*" "*" "*"  "*" 
    # BIC 값 확인으로 사용 모델 선택 - 변수 3개 선택 -46.7732016358928
    BestBic <- summary(df_leaps)$bic
    BestBic
    1. -37.7946153185172
    2. -46.348243336331
    3. -46.7732016358928
    4. -45.099468000674
    5. -42.9871333658026
    6. -40.2266298608964
    7. -37.0962985327756
    8. -33.7785850465411
    9. -30.3710226982818
    10. -26.9226107507162
    # 그래프로 변수 찾기
    Min.val <- which.min(BestBic)
    plot(BestBic, type='b')
    points(Min.val, BestBic[Min.val], cex=5, col='red', pch=20)

    가장 낮은 값 확인

    # 최적의 모델 찾기 coef
    coef(df_leaps, 3)
    (Intercept)
    9.61778051456159
    wt
    -3.91650372494249
    qsec
    1.2258859715837
    am
    2.93583719188942

     

    전진선택법

    # regsubsets() - BIC method='backward'
    df_leaps <- regsubsets(mpg~., data=df, nvmax=10, method='forward')
    # 결과 확인
    # 처음에 선택된 변수는 계속 같이 다님
    summary(df_leaps)
    Subset selection object
    Call: regsubsets.formula(mpg ~ ., data = df, nvmax = 10, method = "forward")
    10 Variables  (and intercept)
         Forced in Forced out
    cyl      FALSE      FALSE
    disp     FALSE      FALSE
    hp       FALSE      FALSE
    drat     FALSE      FALSE
    wt       FALSE      FALSE
    qsec     FALSE      FALSE
    vs       FALSE      FALSE
    am       FALSE      FALSE
    gear     FALSE      FALSE
    carb     FALSE      FALSE
    1 subsets of each size up to 10
    Selection Algorithm: forward
              cyl disp hp  drat wt  qsec vs  am  gear carb
    1  ( 1 )  " " " "  " " " "  "*" " "  " " " " " "  " " 
    2  ( 1 )  "*" " "  " " " "  "*" " "  " " " " " "  " " 
    3  ( 1 )  "*" " "  "*" " "  "*" " "  " " " " " "  " " 
    4  ( 1 )  "*" " "  "*" " "  "*" " "  " " "*" " "  " " 
    5  ( 1 )  "*" " "  "*" " "  "*" "*"  " " "*" " "  " " 
    6  ( 1 )  "*" "*"  "*" " "  "*" "*"  " " "*" " "  " " 
    7  ( 1 )  "*" "*"  "*" "*"  "*" "*"  " " "*" " "  " " 
    8  ( 1 )  "*" "*"  "*" "*"  "*" "*"  " " "*" "*"  " " 
    9  ( 1 )  "*" "*"  "*" "*"  "*" "*"  " " "*" "*"  "*" 
    10  ( 1 ) "*" "*"  "*" "*"  "*" "*"  "*" "*" "*"  "*" 
    ForwardBic <- summary(df_leaps)$bic
    Min.val <- which.min(ForwardBic)
    plot(ForwardBic, type='b')
    points(Min.val, BestBic[Min.val], cex=5, col='red', pch=20)

    전진선택법 일 때 가장 낮은 값

    # 최적의 모델 구현
    coef(df_leaps, 2)
    (Intercept)
    39.686261480253
    cyl
    -1.5077949682598
    wt
    -3.19097213898375

     

    후진제거법

    # regsubsets() - BIC method='backward'
    df_leaps <- regsubsets(mpg~., data=df, nvmax=10, method='backward')
    # 결과 확인 
    # 아래에서 위로 결과 확인
    summary(df_leaps)
    Subset selection object
    Call: regsubsets.formula(mpg ~ ., data = df, nvmax = 10, method = "backward")
    10 Variables  (and intercept)
         Forced in Forced out
    cyl      FALSE      FALSE
    disp     FALSE      FALSE
    hp       FALSE      FALSE
    drat     FALSE      FALSE
    wt       FALSE      FALSE
    qsec     FALSE      FALSE
    vs       FALSE      FALSE
    am       FALSE      FALSE
    gear     FALSE      FALSE
    carb     FALSE      FALSE
    1 subsets of each size up to 10
    Selection Algorithm: backward
              cyl disp hp  drat wt  qsec vs  am  gear carb
    1  ( 1 )  " " " "  " " " "  "*" " "  " " " " " "  " " 
    2  ( 1 )  " " " "  " " " "  "*" "*"  " " " " " "  " " 
    3  ( 1 )  " " " "  " " " "  "*" "*"  " " "*" " "  " " 
    4  ( 1 )  " " " "  "*" " "  "*" "*"  " " "*" " "  " " 
    5  ( 1 )  " " "*"  "*" " "  "*" "*"  " " "*" " "  " " 
    6  ( 1 )  " " "*"  "*" "*"  "*" "*"  " " "*" " "  " " 
    7  ( 1 )  " " "*"  "*" "*"  "*" "*"  " " "*" "*"  " " 
    8  ( 1 )  " " "*"  "*" "*"  "*" "*"  " " "*" "*"  "*" 
    9  ( 1 )  " " "*"  "*" "*"  "*" "*"  "*" "*" "*"  "*" 
    10  ( 1 ) "*" "*"  "*" "*"  "*" "*"  "*" "*" "*"  "*" 
    BackwardBic <- summary(df_leaps)$bic
    Min.val <- which.min(BackwardBic)
    plot(BackwardBic, type='b')
    points(Min.val, BackwardBic[Min.val], cex=5, col='red', pch=20)

    후진제거법 일 떄 가장 작은 값

    # 최적의 모델 구현
    coef(df_leaps, 3)
    (Intercept)
    9.61778051456161
    wt
    -3.91650372494249
    qsec
    1.2258859715837
    am
    2.93583719188942

     

    명목형 변수 해석하기

    • 문자형 -> 숫자형

    • 없다 0, 있다1 바꾸어서 의미를 파악

    • 남자 0, 여자 1 y = 2 + 4x => 남자라면 2, 여자면 6이됨

    head(mtcars)
     
      mpg cyl disp hp drat wt qsec vs am gear carb
      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
    Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
    Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
    Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
    Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
    Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
    Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
    # am  0 auto, 1 수동 => 수동이면 연비가 올라감 7.245
    lm(mpg ~ am, data = mtcars)
    Call:
    lm(formula = mpg ~ am, data = mtcars)
    
    Coefficients:
    (Intercept)           am  
         17.147        7.245  

    다중 명목형 변수

    install.packages('car')
    also installing the dependencies 'matrixStats', 'RcppArmadillo', 'zip', 'SparseM', 'MatrixModels', 'conquer', 'sp', 'openxlsx', 'minqa', 'nloptr', 'statmod', 'RcppEigen', 'carData', 'abind', 'pbkrtest', 'quantreg', 'maptools', 'rio', 'lme4'
    
    
    
    
    package 'matrixStats' successfully unpacked and MD5 sums checked
    package 'RcppArmadillo' successfully unpacked and MD5 sums checked
    package 'zip' successfully unpacked and MD5 sums checked
    package 'SparseM' successfully unpacked and MD5 sums checked
    package 'MatrixModels' successfully unpacked and MD5 sums checked
    package 'conquer' successfully unpacked and MD5 sums checked
    package 'sp' successfully unpacked and MD5 sums checked
    package 'openxlsx' successfully unpacked and MD5 sums checked
    package 'minqa' successfully unpacked and MD5 sums checked
    package 'nloptr' successfully unpacked and MD5 sums checked
    package 'statmod' successfully unpacked and MD5 sums checked
    package 'RcppEigen' successfully unpacked and MD5 sums checked
    package 'carData' successfully unpacked and MD5 sums checked
    package 'abind' successfully unpacked and MD5 sums checked
    package 'pbkrtest' successfully unpacked and MD5 sums checked
    package 'quantreg' successfully unpacked and MD5 sums checked
    package 'maptools' successfully unpacked and MD5 sums checked
    package 'rio' successfully unpacked and MD5 sums checked
    package 'lme4' successfully unpacked and MD5 sums checked
    package 'car' successfully unpacked and MD5 sums checked
    
    The downloaded binary packages are in
        C:\Users\205\AppData\Local\Temp\Rtmpk3oBnd\downloaded_packages
    library(car)
    Warning message:
    "package 'car' was built under R version 4.0.2"
    Loading required package: carData
    head(Vocab, 3)
     
      year sex education vocabulary
      <dbl> <fct> <dbl> <dbl>
    19740001 1974 Male 14 9
    19740002 1974 Male 16 9
    19740003 1974 Female 10 9
    str(Vocab)
    'data.frame':    30351 obs. of  4 variables:
     $ year      : num  1974 1974 1974 1974 1974 ...
     $ sex       : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 2 2 2 1 1 ...
     $ education : num  14 16 10 10 12 16 17 10 12 11 ...
     $ vocabulary: num  9 9 9 5 8 8 9 5 3 5 ...
     - attr(*, "na.action")= 'omit' Named int [1:32115] 1 2 3 4 5 6 7 8 9 10 ...
      ..- attr(*, "names")= chr [1:32115] "19720001" "19720002" "19720003" "19720004" ...
    # year 관측연도, sex 성별, education 교육횟수, vocabulary 단어시험점수
    summary(Vocab)
          year          sex          education       vocabulary    
     Min.   :1974   Female:17148   Min.   : 0.00   Min.   : 0.000  
     1st Qu.:1987   Male  :13203   1st Qu.:12.00   1st Qu.: 5.000  
     Median :1994                  Median :12.00   Median : 6.000  
     Mean   :1995                  Mean   :13.03   Mean   : 6.004  
     3rd Qu.:2006                  3rd Qu.:15.00   3rd Qu.: 7.000  
     Max.   :2016                  Max.   :20.00   Max.   :10.000  
    ?Vocab
    Lm <- lm(Vocab$vocabulary~Vocab$sex+Vocab$education, data = Vocab)
    Lm
    Call:
    lm(formula = Vocab$vocabulary ~ Vocab$sex + Vocab$education, 
        data = Vocab)
    
    Coefficients:
        (Intercept)    Vocab$sexMale  Vocab$education  
             1.7366          -0.1687           0.3330  

    Vocab$sexMale 남자이면 점수 -0.1687 낮아짐

    교육 1번에 0.333점 올라감

    Chap06-2.html
    0.45MB

    'R' 카테고리의 다른 글

    회귀분석  (0) 2020.07.13
    확률 - 예제  (0) 2020.07.13
    확률  (0) 2020.07.13
    시각화 - D3.js  (0) 2020.07.09
    시각화 - 예제  (0) 2020.07.09
Designed by Tistory.