진행하기에 앞서 데이터의 정보를 확인한다.
data<-read.csv("data/nasa.csv")
summary(data);str(data)
## Neo.Reference.ID Name Absolute.Magnitude Est.Dia.in.KM.min.
## Min. :2000433 Min. :2000433 Min. :11.16 Min. : 0.001011
## 1st Qu.:3097594 1st Qu.:3097594 1st Qu.:20.10 1st Qu.: 0.033462
## Median :3514799 Median :3514799 Median :21.90 Median : 0.110804
## Mean :3272298 Mean :3272298 Mean :22.27 Mean : 0.204604
## 3rd Qu.:3690060 3rd Qu.:3690060 3rd Qu.:24.50 3rd Qu.: 0.253837
## Max. :3781897 Max. :3781897 Max. :32.10 Max. :15.579552
## Est.Dia.in.KM.max. Est.Dia.in.M.min. Est.Dia.in.M.max.
## Min. : 0.00226 Min. : 1.011 Min. : 2.26
## 1st Qu.: 0.07482 1st Qu.: 33.462 1st Qu.: 74.82
## Median : 0.24777 Median : 110.804 Median : 247.77
## Mean : 0.45751 Mean : 204.604 Mean : 457.51
## 3rd Qu.: 0.56760 3rd Qu.: 253.837 3rd Qu.: 567.60
## Max. :34.83694 Max. :15579.552 Max. :34836.94
## Est.Dia.in.Miles.min. Est.Dia.in.Miles.max. Est.Dia.in.Feet.min.
## Min. :0.000628 Min. : 0.001404 Min. : 3.32
## 1st Qu.:0.020792 1st Qu.: 0.046493 1st Qu.: 109.78
## Median :0.068850 Median : 0.153954 Median : 363.53
## Mean :0.127135 Mean : 0.284283 Mean : 671.27
## 3rd Qu.:0.157727 3rd Qu.: 0.352688 3rd Qu.: 832.80
## Max. :9.680682 Max. :21.646663 Max. :51114.02
## Est.Dia.in.Feet.max. Epoch.Date.Close.Approach Relative.Velocity.km.per.sec
## Min. : 7.41 Min. :7.889e+11 Min. : 0.3355
## 1st Qu.: 245.49 1st Qu.:1.016e+12 1st Qu.: 8.4329
## Median : 812.88 Median :1.203e+12 Median :12.9179
## Mean : 1501.01 Mean :1.180e+12 Mean :13.9708
## 3rd Qu.: 1862.19 3rd Qu.:1.356e+12 3rd Qu.:18.0776
## Max. :114294.42 Max. :1.473e+12 Max. :44.6337
## Relative.Velocity.km.per.hr Miles.per.hour Miss.Dist..Astronomical.
## Min. : 1208 Min. : 750.5 Min. :0.0001779
## 1st Qu.: 30358 1st Qu.:18863.5 1st Qu.:0.1334196
## Median : 46504 Median :28896.0 Median :0.2650286
## Mean : 50295 Mean :31251.3 Mean :0.2567782
## 3rd Qu.: 65080 3rd Qu.:40437.9 3rd Qu.:0.3841541
## Max. :160681 Max. :99841.2 Max. :0.4998841
## Miss.Dist..lunar. Miss.Dist..kilometers. Miss.Dist..miles. Orbiting.Body
## Min. : 0.06919 Min. : 26610 Min. : 16535 Earth:4687
## 1st Qu.: 51.90021 1st Qu.:19959283 1st Qu.:12402124
## Median :103.09612 Median :39647712 Median :24635948
## Mean : 99.88671 Mean :38413467 Mean :23869022
## 3rd Qu.:149.43592 3rd Qu.:57468628 3rd Qu.:35709350
## Max. :194.45491 Max. :74781600 Max. :46467132
## Orbit.ID Orbit.Uncertainity Minimum.Orbit.Intersection
## Min. : 1.0 Min. :0.000 Min. :0.0000021
## 1st Qu.: 9.0 1st Qu.:0.000 1st Qu.:0.0145851
## Median : 16.0 Median :3.000 Median :0.0473655
## Mean : 28.3 Mean :3.517 Mean :0.0823201
## 3rd Qu.: 31.0 3rd Qu.:6.000 3rd Qu.:0.1235935
## Max. :611.0 Max. :9.000 Max. :0.4778910
## Jupiter.Tisserand.Invariant Epoch.Osculation Eccentricity
## Min. :2.196 Min. :2450165 Min. :0.007522
## 1st Qu.:4.050 1st Qu.:2458001 1st Qu.:0.240858
## Median :5.071 Median :2458001 Median :0.372450
## Mean :5.056 Mean :2457724 Mean :0.382569
## 3rd Qu.:6.019 3rd Qu.:2458001 3rd Qu.:0.512411
## Max. :9.025 Max. :2458020 Max. :0.960261
## Semi.Major.Axis Inclination Asc.Node.Longitude Orbital.Period
## Min. :0.6159 Min. : 0.01451 Min. : 0.0019 Min. : 176.6
## 1st Qu.:1.0006 1st Qu.: 4.96234 1st Qu.: 83.0812 1st Qu.: 365.6
## Median :1.2410 Median :10.31184 Median :172.6254 Median : 504.9
## Mean :1.4003 Mean :13.37384 Mean :172.1573 Mean : 635.6
## 3rd Qu.:1.6784 3rd Qu.:19.51168 3rd Qu.:255.0269 3rd Qu.: 794.2
## Max. :5.0720 Max. :75.40667 Max. :359.9059 Max. :4172.2
## Perihelion.Distance Perihelion.Arg Aphelion.Dist Perihelion.Time
## Min. :0.08074 Min. : 0.0069 Min. :0.8038 Min. :2450100
## 1st Qu.:0.63083 1st Qu.: 95.6259 1st Qu.:1.2661 1st Qu.:2457815
## Median :0.83315 Median :189.7616 Median :1.6182 Median :2457973
## Mean :0.81338 Mean :183.9322 Mean :1.9871 Mean :2457728
## 3rd Qu.:0.99723 3rd Qu.:271.7776 3rd Qu.:2.4512 3rd Qu.:2458108
## Max. :1.29983 Max. :359.9931 Max. :8.9839 Max. :2458839
## Mean.Anomaly Mean.Motion Equinox Hazardous
## Min. : 0.0032 Min. :0.08628 J2000:4687 Mode :logical
## 1st Qu.: 87.0069 1st Qu.:0.45329 FALSE:3932
## Median :185.7189 Median :0.71295 TRUE :755
## Mean :181.1679 Mean :0.73824
## 3rd Qu.:276.5319 3rd Qu.:0.98467
## Max. :359.9180 Max. :2.03900
## 'data.frame': 4687 obs. of 38 variables:
## $ Neo.Reference.ID : int 3703080 3723955 2446862 3092506 3514799 3671135 2495323 2153315 2162463 2306383 ...
## $ Name : int 3703080 3723955 2446862 3092506 3514799 3671135 2495323 2153315 2162463 2306383 ...
## $ Absolute.Magnitude : num 21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
## $ Est.Dia.in.KM.min. : num 0.1272 0.1461 0.2315 0.0088 0.1272 ...
## $ Est.Dia.in.KM.max. : num 0.2845 0.3266 0.5177 0.0197 0.2845 ...
## $ Est.Dia.in.M.min. : num 127.2 146.1 231.5 8.8 127.2 ...
## $ Est.Dia.in.M.max. : num 284.5 326.6 517.7 19.7 284.5 ...
## $ Est.Dia.in.Miles.min. : num 0.07905 0.09076 0.14385 0.00547 0.07905 ...
## $ Est.Dia.in.Miles.max. : num 0.1768 0.203 0.3217 0.0122 0.1768 ...
## $ Est.Dia.in.Feet.min. : num 417.4 479.2 759.5 28.9 417.4 ...
## $ Est.Dia.in.Feet.max. : num 933.3 1071.6 1698.3 64.6 933.3 ...
## $ Epoch.Date.Close.Approach : num 7.89e+11 7.89e+11 7.90e+11 7.90e+11 7.90e+11 ...
## $ Relative.Velocity.km.per.sec: num 6.12 18.11 7.59 11.17 9.84 ...
## $ Relative.Velocity.km.per.hr : num 22017 65210 27327 40226 35427 ...
## $ Miles.per.hour : num 13681 40519 16980 24995 22013 ...
## $ Miss.Dist..Astronomical. : num 0.419 0.383 0.051 0.285 0.408 ...
## $ Miss.Dist..lunar. : num 163.2 149 19.8 111 158.6 ...
## $ Miss.Dist..kilometers. : num 62753692 57298148 7622912 42683616 61010824 ...
## $ Miss.Dist..miles. : num 38993336 35603420 4736658 26522368 37910368 ...
## $ Orbiting.Body : Factor w/ 1 level "Earth": 1 1 1 1 1 1 1 1 1 1 ...
## $ Orbit.ID : int 17 21 22 7 25 40 43 22 100 30 ...
## $ Orbit.Uncertainity : int 5 3 0 6 1 1 1 0 0 0 ...
## $ Minimum.Orbit.Intersection : num 0.02528 0.18693 0.04306 0.00551 0.0348 ...
## $ Jupiter.Tisserand.Invariant : num 4.63 5.46 4.56 5.09 5.15 ...
## $ Epoch.Osculation : num 2458001 2458001 2458001 2458001 2458001 ...
## $ Eccentricity : num 0.426 0.352 0.348 0.217 0.21 ...
## $ Semi.Major.Axis : num 1.41 1.11 1.46 1.26 1.23 ...
## $ Inclination : num 6.03 28.41 4.24 7.91 16.79 ...
## $ Asc.Node.Longitude : num 314.4 136.7 259.5 57.2 84.6 ...
## $ Orbital.Period : num 610 426 644 514 496 ...
## $ Perihelion.Distance : num 0.808 0.718 0.951 0.984 0.968 ...
## $ Perihelion.Arg : num 57.3 313.1 248.4 18.7 158.3 ...
## $ Aphelion.Dist : num 2.01 1.5 1.97 1.53 1.48 ...
## $ Perihelion.Time : num 2458162 2457795 2458120 2457902 2457814 ...
## $ Mean.Anomaly : num 264.8 173.7 292.9 68.7 135.1 ...
## $ Mean.Motion : num 0.591 0.845 0.559 0.7 0.726 ...
## $ Equinox : Factor w/ 1 level "J2000": 1 1 1 1 1 1 1 1 1 1 ...
## $ Hazardous : logi TRUE FALSE TRUE FALSE TRUE FALSE ...
nrow(data);table(data$Hazardous)
## [1] 4687
##
## FALSE TRUE
## 3932 755
table(is.na(data))
##
## FALSE
## 178106
data<-data[,-c(1,2)]
head(data)
## Absolute.Magnitude Est.Dia.in.KM.min. Est.Dia.in.KM.max. Est.Dia.in.M.min.
## 1 21.6 0.127219879 0.28447230 127.219878
## 2 21.3 0.146067964 0.32661790 146.067964
## 3 20.3 0.231502122 0.51765448 231.502122
## 4 27.4 0.008801465 0.01968068 8.801465
## 5 21.6 0.127219879 0.28447230 127.219878
## 6 19.6 0.319561887 0.71456210 319.561887
## Est.Dia.in.M.max. Est.Dia.in.Miles.min. Est.Dia.in.Miles.max.
## 1 284.47230 0.079050743 0.1767628
## 2 326.61790 0.090762397 0.2029509
## 3 517.65448 0.143848705 0.3216555
## 4 19.68067 0.005468975 0.0122290
## 5 284.47230 0.079050743 0.1767628
## 6 714.56210 0.198566489 0.4440082
## Est.Dia.in.Feet.min. Est.Dia.in.Feet.max. Epoch.Date.Close.Approach
## 1 417.3881 933.30809 7.88947e+11
## 2 479.2256 1071.58106 7.88947e+11
## 3 759.5214 1698.34153 7.89552e+11
## 4 28.8762 64.56914 7.90157e+11
## 5 417.3881 933.30809 7.90157e+11
## 6 1048.4314 2344.36393 7.90157e+11
## Relative.Velocity.km.per.sec Relative.Velocity.km.per.hr Miles.per.hour
## 1 6.115834 22017.00 13680.51
## 2 18.113985 65210.35 40519.17
## 3 7.590711 27326.56 16979.66
## 4 11.173875 40225.95 24994.84
## 5 9.840831 35426.99 22012.95
## 6 10.808844 38911.84 24178.30
## Miss.Dist..Astronomical. Miss.Dist..lunar. Miss.Dist..kilometers.
## 1 0.41948253 163.17871 62753692
## 2 0.38301446 148.99263 57298148
## 3 0.05095602 19.82189 7622912
## 4 0.28532233 110.99039 42683616
## 5 0.40783217 158.64671 61010824
## 6 0.39278478 152.79327 58759768
## Miss.Dist..miles. Orbiting.Body Orbit.ID Orbit.Uncertainity
## 1 38993336 Earth 17 5
## 2 35603420 Earth 21 3
## 3 4736658 Earth 22 0
## 4 26522368 Earth 7 6
## 5 37910368 Earth 25 1
## 6 36511628 Earth 40 1
## Minimum.Orbit.Intersection Jupiter.Tisserand.Invariant Epoch.Osculation
## 1 0.0252819 4.634 2458001
## 2 0.1869350 5.457 2458001
## 3 0.0430579 4.557 2458001
## 4 0.0055118 5.093 2458001
## 5 0.0347980 5.154 2458001
## 6 0.2722130 4.724 2458001
## Eccentricity Semi.Major.Axis Inclination Asc.Node.Longitude Orbital.Period
## 1 0.4255491 1.407011 6.025981 314.37391 609.5998
## 2 0.3516743 1.107776 28.412996 136.71724 425.8693
## 3 0.3482483 1.458824 4.237961 259.47598 643.5802
## 4 0.2165783 1.255903 7.905894 57.17327 514.0821
## 5 0.2104479 1.225615 16.793382 84.62931 495.5978
## 6 0.5634411 1.323532 17.927751 178.97195 556.1606
## Perihelion.Distance Perihelion.Arg Aphelion.Dist Perihelion.Time Mean.Anomaly
## 1 0.8082589 57.25747 2.005764 2458162 264.83753
## 2 0.7181996 313.09197 1.497352 2457795 173.74111
## 3 0.9507910 248.41504 1.966857 2458120 292.89365
## 4 0.9839016 18.70770 1.527904 2457902 68.74101
## 5 0.9676866 158.26360 1.483543 2457814 135.14213
## 6 0.5777998 198.14597 2.069265 2458009 354.23737
## Mean.Motion Equinox Hazardous
## 1 0.5905514 J2000 TRUE
## 2 0.8453298 J2000 FALSE
## 3 0.5593708 J2000 TRUE
## 4 0.7002772 J2000 FALSE
## 5 0.7263954 J2000 TRUE
## 6 0.6472951 J2000 FALSE
해당 데이터 원본은 NASA에서 작성되었으며, 데이터의 목적은 환자의 diabete 보유여부를 예측하기 위함이다. X변수로 공전주기, 이심률 등이 있으며, Y변수로 위험혜성여부를 뜻하는 지시변수로써 Hazardous 변수가 있다. 1,2열은 혜성 ID이므로 제외하도록 한다. NA값은 없으며, 총 4687개의 데이터 중 755개가 Hazardous Asteroid로 추정된다.
str(data)
## 'data.frame': 4687 obs. of 36 variables:
## $ Absolute.Magnitude : num 21.6 21.3 20.3 27.4 21.6 19.6 19.6 19.2 17.8 21.5 ...
## $ Est.Dia.in.KM.min. : num 0.1272 0.1461 0.2315 0.0088 0.1272 ...
## $ Est.Dia.in.KM.max. : num 0.2845 0.3266 0.5177 0.0197 0.2845 ...
## $ Est.Dia.in.M.min. : num 127.2 146.1 231.5 8.8 127.2 ...
## $ Est.Dia.in.M.max. : num 284.5 326.6 517.7 19.7 284.5 ...
## $ Est.Dia.in.Miles.min. : num 0.07905 0.09076 0.14385 0.00547 0.07905 ...
## $ Est.Dia.in.Miles.max. : num 0.1768 0.203 0.3217 0.0122 0.1768 ...
## $ Est.Dia.in.Feet.min. : num 417.4 479.2 759.5 28.9 417.4 ...
## $ Est.Dia.in.Feet.max. : num 933.3 1071.6 1698.3 64.6 933.3 ...
## $ Epoch.Date.Close.Approach : num 7.89e+11 7.89e+11 7.90e+11 7.90e+11 7.90e+11 ...
## $ Relative.Velocity.km.per.sec: num 6.12 18.11 7.59 11.17 9.84 ...
## $ Relative.Velocity.km.per.hr : num 22017 65210 27327 40226 35427 ...
## $ Miles.per.hour : num 13681 40519 16980 24995 22013 ...
## $ Miss.Dist..Astronomical. : num 0.419 0.383 0.051 0.285 0.408 ...
## $ Miss.Dist..lunar. : num 163.2 149 19.8 111 158.6 ...
## $ Miss.Dist..kilometers. : num 62753692 57298148 7622912 42683616 61010824 ...
## $ Miss.Dist..miles. : num 38993336 35603420 4736658 26522368 37910368 ...
## $ Orbiting.Body : Factor w/ 1 level "Earth": 1 1 1 1 1 1 1 1 1 1 ...
## $ Orbit.ID : int 17 21 22 7 25 40 43 22 100 30 ...
## $ Orbit.Uncertainity : int 5 3 0 6 1 1 1 0 0 0 ...
## $ Minimum.Orbit.Intersection : num 0.02528 0.18693 0.04306 0.00551 0.0348 ...
## $ Jupiter.Tisserand.Invariant : num 4.63 5.46 4.56 5.09 5.15 ...
## $ Epoch.Osculation : num 2458001 2458001 2458001 2458001 2458001 ...
## $ Eccentricity : num 0.426 0.352 0.348 0.217 0.21 ...
## $ Semi.Major.Axis : num 1.41 1.11 1.46 1.26 1.23 ...
## $ Inclination : num 6.03 28.41 4.24 7.91 16.79 ...
## $ Asc.Node.Longitude : num 314.4 136.7 259.5 57.2 84.6 ...
## $ Orbital.Period : num 610 426 644 514 496 ...
## $ Perihelion.Distance : num 0.808 0.718 0.951 0.984 0.968 ...
## $ Perihelion.Arg : num 57.3 313.1 248.4 18.7 158.3 ...
## $ Aphelion.Dist : num 2.01 1.5 1.97 1.53 1.48 ...
## $ Perihelion.Time : num 2458162 2457795 2458120 2457902 2457814 ...
## $ Mean.Anomaly : num 264.8 173.7 292.9 68.7 135.1 ...
## $ Mean.Motion : num 0.591 0.845 0.559 0.7 0.726 ...
## $ Equinox : Factor w/ 1 level "J2000": 1 1 1 1 1 1 1 1 1 1 ...
## $ Hazardous : logi TRUE FALSE TRUE FALSE TRUE FALSE ...
table(data$Orbiting.Body)
##
## Earth
## 4687
table(data$Equinox)
##
## J2000
## 4687
head(data$Orbit.ID)
## [1] 17 21 22 7 25 40
head(data$Hazardous)
## [1] TRUE FALSE TRUE FALSE TRUE FALSE
Orbiting.Body, Equinox는 모두 동일한 값이므로 제외한다. Orbit.ID는 이름값이므로 제외한다. Hazardous는 잠시 떼어놓고, normalizing 후 1,0의 값을 가지는 지시변수로 변환하도록 한다.
library(tidyverse)
## Warning: 패키지 'tidyverse'는 R 버전 3.6.3에서 작성되었습니다
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.3.0 --
## √ ggplot2 3.2.1 √ purrr 0.3.3
## √ tibble 2.1.3 √ dplyr 0.8.5
## √ tidyr 1.0.2 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.5.0
## Warning: 패키지 'tidyr'는 R 버전 3.6.3에서 작성되었습니다
## Warning: 패키지 'readr'는 R 버전 3.6.3에서 작성되었습니다
## Warning: 패키지 'purrr'는 R 버전 3.6.3에서 작성되었습니다
## Warning: 패키지 'dplyr'는 R 버전 3.6.3에서 작성되었습니다
## Warning: 패키지 'stringr'는 R 버전 3.6.3에서 작성되었습니다
## Warning: 패키지 'forcats'는 R 버전 3.6.3에서 작성되었습니다
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
data<-data %>%
select(-Orbiting.Body,-Orbit.ID,-Equinox)
nor=function(x){(x-min(x))/(max(x)-min(x))}
ndata<-nor(data[1:32])
ndata<-mutate(ndata,hazardous=as.numeric(data$Hazardous=='TRUE'))
table(ndata$hazardous)
##
## 0 1
## 3932 755
summary(ndata)
## Absolute.Magnitude Est.Dia.in.KM.min. Est.Dia.in.KM.max.
## Min. :7.575e-12 Min. :6.860e-16 Min. :1.534e-15
## 1st Qu.:1.364e-11 1st Qu.:2.271e-14 1st Qu.:5.079e-14
## Median :1.486e-11 Median :7.521e-14 Median :1.682e-13
## Mean :1.511e-11 Mean :1.389e-13 Mean :3.105e-13
## 3rd Qu.:1.663e-11 3rd Qu.:1.723e-13 3rd Qu.:3.852e-13
## Max. :2.179e-11 Max. :1.057e-11 Max. :2.365e-11
## Est.Dia.in.M.min. Est.Dia.in.M.max. Est.Dia.in.Miles.min.
## Min. :6.860e-13 Min. :1.534e-12 Min. :4.260e-16
## 1st Qu.:2.271e-11 1st Qu.:5.079e-11 1st Qu.:1.411e-14
## Median :7.521e-11 Median :1.682e-10 Median :4.673e-14
## Mean :1.389e-10 Mean :3.105e-10 Mean :8.629e-14
## 3rd Qu.:1.723e-10 3rd Qu.:3.852e-10 3rd Qu.:1.071e-13
## Max. :1.057e-08 Max. :2.365e-08 Max. :6.571e-12
## Est.Dia.in.Miles.max. Est.Dia.in.Feet.min. Est.Dia.in.Feet.max.
## Min. :9.530e-16 Min. :2.250e-12 Min. :5.030e-12
## 1st Qu.:3.156e-14 1st Qu.:7.451e-11 1st Qu.:1.666e-10
## Median :1.045e-13 Median :2.467e-10 Median :5.517e-10
## Mean :1.930e-13 Mean :4.556e-10 Mean :1.019e-09
## 3rd Qu.:2.394e-13 3rd Qu.:5.652e-10 3rd Qu.:1.264e-09
## Max. :1.469e-11 Max. :3.469e-08 Max. :7.758e-08
## Epoch.Date.Close.Approach Relative.Velocity.km.per.sec
## Min. :0.5355 Min. :2.277e-13
## 1st Qu.:0.6893 1st Qu.:5.724e-12
## Median :0.8166 Median :8.768e-12
## Mean :0.8008 Mean :9.483e-12
## 3rd Qu.:0.9201 3rd Qu.:1.227e-11
## Max. :1.0000 Max. :3.029e-11
## Relative.Velocity.km.per.hr Miles.per.hour Miss.Dist..Astronomical.
## Min. :8.198e-10 Min. :5.094e-10 Min. :1.207e-16
## 1st Qu.:2.061e-08 1st Qu.:1.280e-08 1st Qu.:9.056e-14
## Median :3.156e-08 Median :1.961e-08 Median :1.799e-13
## Mean :3.414e-08 Mean :2.121e-08 Mean :1.743e-13
## 3rd Qu.:4.417e-08 3rd Qu.:2.745e-08 3rd Qu.:2.607e-13
## Max. :1.091e-07 Max. :6.777e-08 Max. :3.393e-13
## Miss.Dist..lunar. Miss.Dist..kilometers. Miss.Dist..miles.
## Min. :4.696e-14 Min. :1.806e-08 Min. :1.122e-08
## 1st Qu.:3.523e-11 1st Qu.:1.355e-05 1st Qu.:8.418e-06
## Median :6.998e-11 Median :2.691e-05 Median :1.672e-05
## Mean :6.780e-11 Mean :2.607e-05 Mean :1.620e-05
## 3rd Qu.:1.014e-10 3rd Qu.:3.901e-05 3rd Qu.:2.424e-05
## Max. :1.320e-10 Max. :5.076e-05 Max. :3.154e-05
## Orbit.Uncertainity Minimum.Orbit.Intersection Jupiter.Tisserand.Invariant
## Min. :0.000e+00 Min. :1.400e-18 Min. :1.491e-12
## 1st Qu.:0.000e+00 1st Qu.:9.899e-15 1st Qu.:2.749e-12
## Median :2.036e-12 Median :3.215e-14 Median :3.442e-12
## Mean :2.387e-12 Mean :5.587e-14 Mean :3.432e-12
## 3rd Qu.:4.072e-12 3rd Qu.:8.389e-14 3rd Qu.:4.085e-12
## Max. :6.109e-12 Max. :3.244e-13 Max. :6.126e-12
## Epoch.Osculation Eccentricity Semi.Major.Axis
## Min. :1.663e-06 Min. :5.106e-15 Min. :4.180e-13
## 1st Qu.:1.668e-06 1st Qu.:1.635e-13 1st Qu.:6.792e-13
## Median :1.668e-06 Median :2.528e-13 Median :8.423e-13
## Mean :1.668e-06 Mean :2.597e-13 Mean :9.504e-13
## 3rd Qu.:1.668e-06 3rd Qu.:3.478e-13 3rd Qu.:1.139e-12
## Max. :1.668e-06 Max. :6.518e-13 Max. :3.443e-12
## Inclination Asc.Node.Longitude Orbital.Period
## Min. :9.850e-15 Min. :1.320e-15 Min. :1.198e-10
## 1st Qu.:3.368e-12 1st Qu.:5.639e-11 1st Qu.:2.482e-10
## Median :6.999e-12 Median :1.172e-10 Median :3.427e-10
## Mean :9.077e-12 Mean :1.168e-10 Mean :4.314e-10
## 3rd Qu.:1.324e-11 3rd Qu.:1.731e-10 3rd Qu.:5.391e-10
## Max. :5.118e-11 Max. :2.443e-10 Max. :2.832e-09
## Perihelion.Distance Perihelion.Arg Aphelion.Dist
## Min. :5.480e-14 Min. :4.700e-15 Min. :5.455e-13
## 1st Qu.:4.282e-13 1st Qu.:6.491e-11 1st Qu.:8.593e-13
## Median :5.655e-13 Median :1.288e-10 Median :1.098e-12
## Mean :5.521e-13 Mean :1.248e-10 Mean :1.349e-12
## 3rd Qu.:6.769e-13 3rd Qu.:1.845e-10 3rd Qu.:1.664e-12
## Max. :8.822e-13 Max. :2.443e-10 Max. :6.098e-12
## Perihelion.Time Mean.Anomaly Mean.Motion hazardous
## Min. :1.663e-06 Min. :2.170e-15 Min. :5.856e-14 Min. :0.0000
## 1st Qu.:1.668e-06 1st Qu.:5.905e-11 1st Qu.:3.077e-13 1st Qu.:0.0000
## Median :1.668e-06 Median :1.261e-10 Median :4.839e-13 Median :0.0000
## Mean :1.668e-06 Mean :1.230e-10 Mean :5.011e-13 Mean :0.1611
## 3rd Qu.:1.668e-06 3rd Qu.:1.877e-10 3rd Qu.:6.683e-13 3rd Qu.:0.0000
## Max. :1.669e-06 Max. :2.443e-10 Max. :1.384e-12 Max. :1.0000
Y변수인 hazardous를 제외한 X변수들의 normalizing이 완료되었다.
1
- Training 데이터와 Test 데이터를 50:50의 비율로 분할하시오. (단, 시드번호는 학번의 뒤자리수 4개를 사용하시오)
set.seed(2059)
nobs=nrow(ndata)
i=sample(1:nobs,round(nobs*0.5))
train=ndata[i,]
test=ndata[-i,]
nrow(train);nrow(test)
## [1] 2344
## [1] 2343
2, 3
R 프로그램의 ‘neuralnet’ 명령어를 사용하여 신경망 분석을 수행하고자 한다. 단, hyper-parameter는 아래와 같이 조정한다. A. 은닉층은 1 ~ 3를 사용하고, B. 각 은닉층내에서 은닉노드는 2 ~ 5 개까지 사용한다. C. stepmax = 1e+06을 사용한다. D. 그외 parameter 값들은 default 값을 사용한다. E. 만약 신경망의 학습이 수렴하지 않아서 error가 나오는 경우에는 수렴하지 않았다고 보고서를 작성한다.
위 2번의 조건에 맞는 신경망을 training 데이터를 이용하여 생성하고, test 데이터를 이용하여 예측 정확도를 계산하고자 한다. 이때 예측정확도는 AUROC 값을 사용한다.
head(test[,33])
## [1] 1 0 1 0 0 0
library(neuralnet)
## Warning: 패키지 'neuralnet'는 R 버전 3.6.3에서 작성되었습니다
##
## 다음의 패키지를 부착합니다: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
library(pROC)
## Warning: 패키지 'pROC'는 R 버전 3.6.3에서 작성되었습니다
## Type 'citation("pROC")' for a citation.
##
## 다음의 패키지를 부착합니다: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
auc1<-c();auc2<-c();auc3<-c();auc4<-c()
for(i in 1:3){
nn<-neuralnet(hazardous~.,data=train, hidden=rep(2,times=i),stepmax=1e+06,act.fct='logistic',linear.output=F)
pred<-compute(nn,covariate=test[,-33])
auc1[i]<-roc(test$hazardous~as.vector(pred$net.result))$auc
print(head(pred$net.result),5)
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## [,1]
## 3 0.21619
## 4 0.21608
## 5 0.21609
## 6 0.21609
## 8 0.21592
## 9 0.21595
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## [,1]
## 3 0.21457
## 4 0.21456
## 5 0.21462
## 6 0.21461
## 8 0.21436
## 9 0.21447
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## [,1]
## 3 0.21308
## 4 0.21301
## 5 0.21302
## 6 0.21302
## 8 0.21294
## 9 0.21294
for(i in 1:3){
nn<-neuralnet(hazardous~.,data=train, hidden=rep(3,times=i),stepmax=1e+06,act.fct='logistic',linear.output=F)
pred<-compute(nn,covariate=test[,-33])
auc2[i]<-roc(test$hazardous~as.vector(pred$net.result))$auc
print(head(pred$net.result),5)
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## [,1]
## 3 0.21258
## 4 0.21251
## 5 0.21254
## 6 0.21253
## 8 0.21237
## 9 0.21241
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## [,1]
## 3 0.21142
## 4 0.21135
## 5 0.21138
## 6 0.21137
## 8 0.21122
## 9 0.21126
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## [,1]
## 3 0.21694
## 4 0.21683
## 5 0.21683
## 6 0.21683
## 8 0.21670
## 9 0.21671
for(i in 1:3){
nn<-neuralnet(hazardous~.,data=train, hidden=rep(4,times=i),stepmax=1e+06,act.fct='logistic',linear.output=F)
pred<-compute(nn,covariate=test[,-33])
auc3[i]<-roc(test$hazardous~as.vector(pred$net.result))$auc
print(head(pred$net.result),5)
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## [,1]
## 3 0.21198
## 4 0.21190
## 5 0.21191
## 6 0.21191
## 8 0.21176
## 9 0.21179
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## [,1]
## 3 0.22090
## 4 0.22075
## 5 0.22075
## 6 0.22075
## 8 0.22056
## 9 0.22058
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## [,1]
## 3 0.21607
## 4 0.21596
## 5 0.21597
## 6 0.21597
## 8 0.21583
## 9 0.21584
for(i in 1:3){
nn<-neuralnet(hazardous~.,data=train, hidden=rep(5,times=i),stepmax=1e+06,act.fct='logistic',linear.output=F)
pred<-compute(nn,covariate=test[,-33])
auc4[i]<-roc(test$hazardous~as.vector(pred$net.result))$auc
print(head(pred$net.result),5)
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## [,1]
## 3 0.21326
## 4 0.21317
## 5 0.21319
## 6 0.21319
## 8 0.21302
## 9 0.21306
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## [,1]
## 3 0.21706
## 4 0.21693
## 5 0.21694
## 6 0.21693
## 8 0.21677
## 9 0.21679
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## [,1]
## 3 0.21685
## 4 0.21672
## 5 0.21673
## 6 0.21672
## 8 0.21658
## 9 0.21659
총 12번의 nn신경망 분석을 실시하였고, 12개의 auc값을 얻었다.
4
- 3번의 결과에서 예측정확도가 가장 높은 최적의 hyper-parameter 조합은 무엇인지 밝히시오.
auc1;auc2;auc3;auc4
## [1] 0.5665675 0.5665688 0.5665675
## [1] 0.5665662 0.5665662 0.5665675
## [1] 0.5665662 0.5665675 0.5665675
## [1] 0.5665662 0.5665675 0.5665675
값은 모두 0.566 근처로 수렴하며, 그 중 가장 높은 값은 3행3열의 0.5665792이다. 따라서, 예측정확도가 가장 높은 조합은 3개의 은닉층과 3개의 은닉노드임을 확인할 수 있다.