# ISYE 6501 Week 10 HW

# Clear the global environment so the script starts from a clean workspace.
# NOTE(review): this removes every object in the caller's workspace when the
# file is sourced; running the script in a fresh R session is safer.
rm(list = ls())

# Import the dataset.
# The raw file has no header row (column names are assigned separately right
# after the import), so it must be read with header = FALSE. Reading it with
# header = TRUE silently consumes the first record as column names; that is
# why the recorded summary output in this file shows 698 rows (457 + 241)
# instead of the 699 instances in the published dataset, and why the recorded
# row indices are one lower than the records' positions in the file.
# Fields are comma-separated and missing values, written as "?", become NA.
df.data <- read.table(
  "~/Downloads/hw10-FA23/breast-cancer-wisconsin.data.txt",
  header = FALSE, sep = ",", na.strings = "?"
)

# Assign column names taken from the UCI dataset description:
# http://archive.ics.uci.edu/dataset/15/breast+cancer+wisconsin+original
colnames(df.data) <- c(
  "ID", "Clump_Thickness", "Cell_Size", "Cell_Shape",
  "Marg_Adhesion", "Single_Epith_Cell_Size",
  "Bare_Nuclei", "Bland_Chromatin",
  "Normal_Nucleoli", "Mitoses", "Class"
)

# Convert the response column to a factor and relabel its two levels.
# as.factor() orders levels by sorted value, so the smaller original class
# code becomes "0" and the larger one becomes "1".
# NOTE(review): per the UCI description the codes are 2 (benign) and
# 4 (malignant) -- confirm against the data file.
df.data$Class <- as.factor(df.data$Class)
levels(df.data$Class) <- c(0, 1)

# Summary statistics for every column (also reports NA counts per column).
summary(df.data)
##        ID           Clump_Thickness    Cell_Size        Cell_Shape
##  Min.   :   61634   Min.   : 1.000   Min.   : 1.000   Min.   : 1.000
##  1st Qu.:  870258   1st Qu.: 2.000   1st Qu.: 1.000   1st Qu.: 1.000
##  Median : 1171710   Median : 4.000   Median : 1.000   Median : 1.000
##  Mean   : 1071807   Mean   : 4.417   Mean   : 3.138   Mean   : 3.211
##  3rd Qu.: 1238354   3rd Qu.: 6.000   3rd Qu.: 5.000   3rd Qu.: 5.000
##  Max.   :13454352   Max.   :10.000   Max.   :10.000   Max.   :10.000
##
##  Marg_Adhesion    Single_Epith_Cell_Size  Bare_Nuclei      Bland_Chromatin
##  Min.   : 1.000   Min.   : 1.000          Min.   : 1.000   Min.   : 1.000
##  1st Qu.: 1.000   1st Qu.: 2.000          1st Qu.: 1.000   1st Qu.: 2.000
##  Median : 1.000   Median : 2.000          Median : 1.000   Median : 3.000
##  Mean   : 2.809   Mean   : 3.218          Mean   : 3.548   Mean   : 3.438
##  3rd Qu.: 4.000   3rd Qu.: 4.000          3rd Qu.: 6.000   3rd Qu.: 5.000
##  Max.   :10.000   Max.   :10.000          Max.   :10.000   Max.   :10.000
##                                           NA's   :16
##  Normal_Nucleoli     Mitoses       Class
##  Min.   : 1.00   Min.   : 1.00   0:457
##  1st Qu.: 1.00   1st Qu.: 1.00   1:241
##  Median : 1.00   Median : 1.00
##  Mean   : 2.87   Mean   : 1.59
##  3rd Qu.: 4.00   3rd Qu.: 1.00
##  Max.   :10.00   Max.   :10.00
##
# List the rows whose Bare_Nuclei value is missing (this check looks at the
# Bare_Nuclei column only, not at every column of the dataset).
df.data[is.na(df.data$Bare_Nuclei), ]
##          ID Clump_Thickness Cell_Size Cell_Shape Marg_Adhesion
## 23  1057013               8         4          5             1
## 40  1096800               6         6          6             9
## 139 1183246               1         1          1             1
## 145 1184840               1         1          3             1
## 158 1193683               1         1          2             1
## 164 1197510               5         1          1             1
## 235 1241232               3         1          4             1
## 249  169356               3         1          1             1
## 275  432809               3         1          3             1
## 292  563649               8         8          8             1
## 294  606140               1         1          1             1
## 297   61634               5         4          3             1
## 315  704168               4         6          5             6
## 321  733639               3         1          1             1
## 411 1238464               1         1          1             1
## 617 1057067               1         1          1             1
##     Single_Epith_Cell_Size Bare_Nuclei Bland_Chromatin Normal_Nucleoli Mitoses
## 23                       2          NA               7               3       1
## 40                       6          NA               7               8       1
## 139                      1          NA               2               1       1
## 145                      2          NA               2               1       1
## 158                      3          NA               1               1       1
## 164                      2          NA               3               1       1
## 235                      2          NA               3               1       1
## 249                      2          NA               3               1       1
## 275                      2          NA               2               1       1
## 292                      2          NA               6              10       1
## 294                      2          NA               2               1       1
## 297                      2          NA               2               3       1
## 315 7 NA 4 9
https://www.stuvia.com/user/nursecare