This lecture note is based on Dr. Hua Zhou’s 2018 Winter Statistical Computing course notes available at http://hua-zhou.github.io/teaching/biostatm280-2018winter/index.html.
A typical data science project:
Tibbles extend data frames in R and form the core of tidyverse.
library("tidyverse")
Convert a regular data frame to tibble:
# a regular data frame
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
as_tibble(iris)
## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## # ... with 140 more rows
Convert a tibble to data frame:
as.data.frame(tb)
Create tibble from individual vectors:
tibble(
x = 1:5,
y = 1,
z = x ^ 2 + y
)
## # A tibble: 5 x 3
## x y z
## <int> <dbl> <dbl>
## 1 1 1 2
## 2 2 1 5
## 3 3 1 10
## 4 4 1 17
## 5 5 1 26
Transposed tibbles:
tribble(
~x, ~y, ~z,
#--|--|----
"a", 2, 3.6,
"b", 1, 8.5
)
## # A tibble: 2 x 3
## x y z
## <chr> <dbl> <dbl>
## 1 a 2 3.6
## 2 b 1 8.5
By default, tibble prints the first 10 rows and all columns that fit on screen.
tibble(
a = lubridate::now() + runif(1e3) * 86400,
b = lubridate::today() + runif(1e3) * 30,
c = 1:1e3,
d = runif(1e3),
e = sample(letters, 1e3, replace = TRUE)
)
## # A tibble: 1,000 x 5
## a b c d e
## <dttm> <date> <int> <dbl> <chr>
## 1 2018-10-24 18:41:06 2018-11-02 1 0.0962 y
## 2 2018-10-25 08:50:30 2018-10-25 2 0.606 e
## 3 2018-10-25 06:06:12 2018-11-05 3 0.934 f
## 4 2018-10-24 13:48:46 2018-11-02 4 0.277 r
## 5 2018-10-24 14:48:15 2018-11-02 5 0.494 k
## 6 2018-10-24 21:29:55 2018-11-12 6 0.443 a
## 7 2018-10-24 11:20:00 2018-11-20 7 0.164 m
## 8 2018-10-24 15:12:38 2018-11-14 8 0.568 e
## 9 2018-10-25 06:14:28 2018-10-25 9 0.467 s
## 10 2018-10-25 00:26:01 2018-11-12 10 0.388 m
## # ... with 990 more rows
To change number of rows and columns to display:
nycflights13::flights %>%
print(n = 10, width = Inf)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## sched_arr_time arr_delay carrier flight tailnum origin dest air_time
## <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl>
## 1 819 11 UA 1545 N14228 EWR IAH 227
## 2 830 20 UA 1714 N24211 LGA IAH 227
## 3 850 33 AA 1141 N619AA JFK MIA 160
## 4 1022 -18 B6 725 N804JB JFK BQN 183
## 5 837 -25 DL 461 N668DN LGA ATL 116
## 6 728 12 UA 1696 N39463 EWR ORD 150
## 7 854 19 B6 507 N516JB EWR FLL 158
## 8 723 -14 EV 5708 N829AS LGA IAD 53
## 9 846 -8 B6 79 N593JB JFK MCO 140
## 10 745 8 AA 301 N3ALAA LGA ORD 138
## distance hour minute time_hour
## <dbl> <dbl> <dbl> <dttm>
## 1 1400 5 15 2013-01-01 05:00:00
## 2 1416 5 29 2013-01-01 05:00:00
## 3 1089 5 40 2013-01-01 05:00:00
## 4 1576 5 45 2013-01-01 05:00:00
## 5 762 6 0 2013-01-01 06:00:00
## 6 719 5 58 2013-01-01 05:00:00
## 7 1065 6 0 2013-01-01 06:00:00
## 8 229 6 0 2013-01-01 06:00:00
## 9 944 6 0 2013-01-01 06:00:00
## 10 733 6 0 2013-01-01 06:00:00
## # ... with 3.368e+05 more rows
options(tibble.print_max = n, tibble.print_min = m)
: if more than m
rows, print only n
rows.options(dplyr.print_min = Inf)
: print all row.options(tibble.width = Inf)
: print all columns.df <- tibble(
x = runif(5),
y = rnorm(5)
)
Extract by name:
df$x
## [1] 0.2543329 0.5079928 0.1287096 0.7832409 0.6120130
df[["x"]]
## [1] 0.2543329 0.5079928 0.1287096 0.7832409 0.6120130
Extract by position:
df[[1]]
## [1] 0.2543329 0.5079928 0.1287096 0.7832409 0.6120130
Pipe:
df %>% .$x
## [1] 0.2543329 0.5079928 0.1287096 0.7832409 0.6120130
df %>% .[["x"]]
## [1] 0.2543329 0.5079928 0.1287096 0.7832409 0.6120130
readr package implements functions that turn flat files into tibbles.
read_csv()
, read_csv2()
(semicolon seperated files), read_tsv()
, read_delim()
.
read_fwf()
(fixed width files), read_table()
.
read_log()
(Apache style log files).
An example file heights.csv:
head heights.csv
## "earn","height","sex","ed","age","race"
## 50000,74.4244387818035,"male",16,45,"white"
## 60000,65.5375428255647,"female",16,58,"white"
## 30000,63.6291977374349,"female",16,29,"white"
## 50000,63.1085616752971,"female",16,91,"other"
## 51000,63.4024835710879,"female",17,39,"white"
## 9000,64.3995075440034,"female",15,26,"white"
## 29000,61.6563258264214,"female",12,49,"white"
## 32000,72.6985437364783,"male",17,46,"white"
## 2000,72.0394668497611,"male",15,21,"hispanic"
Read from a local file heights.csv:
(heights <- read_csv("heights.csv"))
## Parsed with column specification:
## cols(
## earn = col_double(),
## height = col_double(),
## sex = col_character(),
## ed = col_integer(),
## age = col_integer(),
## race = col_character()
## )
## # A tibble: 1,192 x 6
## earn height sex ed age race
## <dbl> <dbl> <chr> <int> <int> <chr>
## 1 50000 74.4 male 16 45 white
## 2 60000 65.5 female 16 58 white
## 3 30000 63.6 female 16 29 white
## 4 50000 63.1 female 16 91 other
## 5 51000 63.4 female 17 39 white
## 6 9000 64.4 female 15 26 white
## 7 29000 61.7 female 12 49 white
## 8 32000 72.7 male 17 46 white
## 9 2000 72.0 male 15 21 hispanic
## 10 27000 72.2 male 12 26 white
## # ... with 1,182 more rows
Read from inline csv file:
read_csv("a,b,c
1,2,3
4,5,6")
## # A tibble: 2 x 3
## a b c
## <int> <int> <int>
## 1 1 2 3
## 2 4 5 6
Skip first n
lines:
read_csv("The first line of metadata
The second line of metadata
x,y,z
1,2,3", skip = 2)
## # A tibble: 1 x 3
## x y z
## <int> <int> <int>
## 1 1 2 3
Skip comment lines:
read_csv("# A comment I want to skip
x,y,z
1,2,3", comment = "#")
## # A tibble: 1 x 3
## x y z
## <int> <int> <int>
## 1 1 2 3
No header line:
read_csv("1,2,3\n4,5,6", col_names = FALSE)
## # A tibble: 2 x 3
## X1 X2 X3
## <int> <int> <int>
## 1 1 2 3
## 2 4 5 6
No header line and specify colnames:
read_csv("1,2,3\n4,5,6", col_names = c("x", "y", "z"))
## # A tibble: 2 x 3
## x y z
## <int> <int> <int>
## 1 1 2 3
## 2 4 5 6
Specify the symbol representing missing values:
read_csv("a,b,c\n1,2,.", na = ".")
## # A tibble: 1 x 3
## a b c
## <int> <int> <chr>
## 1 1 2 <NA>
Write to csv:
write_csv(challenge, "challenge.csv")
Write (and read) RDS files:
write_rds(challenge, "challenge.rds")
read_rds("challenge.rds")
haven reads SPSS, Stata, and SAS files.
readxl reads excel files (both .xls and .xlsx).
DBI, along with a database specific backend (e.g. RMySQL, RSQLite, RPostgreSQL etc) allows you to run SQL queries against a database and return a data frame.
jsonlite reads json files.
xml2 reads XML files.
There are three interrelated rules which make a dataset tidy:
Each variable must have its own column.
Each observation must have its own row.
Each value must have its own cell.
Example table1
table1
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
is tidy.
Example table2
table2
## # A tibble: 12 x 4
## country year type count
## <chr> <int> <chr> <int>
## 1 Afghanistan 1999 cases 745
## 2 Afghanistan 1999 population 19987071
## 3 Afghanistan 2000 cases 2666
## 4 Afghanistan 2000 population 20595360
## 5 Brazil 1999 cases 37737
## 6 Brazil 1999 population 172006362
## 7 Brazil 2000 cases 80488
## 8 Brazil 2000 population 174504898
## 9 China 1999 cases 212258
## 10 China 1999 population 1272915272
## 11 China 2000 cases 213766
## 12 China 2000 population 1280428583
is not tidy.
Example table3
table3
## # A tibble: 6 x 3
## country year rate
## * <chr> <int> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
is not tidy.
Example table4a
table4a
## # A tibble: 3 x 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 745 2666
## 2 Brazil 37737 80488
## 3 China 212258 213766
is not tidy.
Example table4b
table4b
## # A tibble: 3 x 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 19987071 20595360
## 2 Brazil 172006362 174504898
## 3 China 1272915272 1280428583
is not tidy.
gather
columns into a new pair of variables.
table4a %>%
gather(`1999`, `2000`, key = "year", value = "cases")
## # A tibble: 6 x 3
## country year cases
## <chr> <chr> <int>
## 1 Afghanistan 1999 745
## 2 Brazil 1999 37737
## 3 China 1999 212258
## 4 Afghanistan 2000 2666
## 5 Brazil 2000 80488
## 6 China 2000 213766
We can gather table4b too and then join them
tidy4a <- table4a %>%
gather(`1999`, `2000`, key = "year", value = "cases")
tidy4b <- table4b %>%
gather(`1999`, `2000`, key = "year", value = "population")
left_join(tidy4a, tidy4b)
## Joining, by = c("country", "year")
## # A tibble: 6 x 4
## country year cases population
## <chr> <chr> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Brazil 1999 37737 172006362
## 3 China 1999 212258 1272915272
## 4 Afghanistan 2000 2666 20595360
## 5 Brazil 2000 80488 174504898
## 6 China 2000 213766 1280428583
Spreading is the opposite of gathering.
spread(table2, key = type, value = count)
## # A tibble: 6 x 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
table3 %>%
separate(rate, into = c("cases", "population"))
## # A tibble: 6 x 4
## country year cases population
## * <chr> <int> <chr> <chr>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
Seperate into numeric values:
table3 %>%
separate(rate, into = c("cases", "population"), convert = TRUE)
## # A tibble: 6 x 4
## country year cases population
## * <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
Separate at a fixed position:
table3 %>%
separate(year, into = c("century", "year"), sep = 2)
## # A tibble: 6 x 4
## country century year rate
## * <chr> <chr> <chr> <chr>
## 1 Afghanistan 19 99 745/19987071
## 2 Afghanistan 20 00 2666/20595360
## 3 Brazil 19 99 37737/172006362
## 4 Brazil 20 00 80488/174504898
## 5 China 19 99 212258/1272915272
## 6 China 20 00 213766/1280428583
table5
## # A tibble: 6 x 4
## country century year rate
## * <chr> <chr> <chr> <chr>
## 1 Afghanistan 19 99 745/19987071
## 2 Afghanistan 20 00 2666/20595360
## 3 Brazil 19 99 37737/172006362
## 4 Brazil 20 00 80488/174504898
## 5 China 19 99 212258/1272915272
## 6 China 20 00 213766/1280428583
unite()
is the inverse of separate()
.
table5 %>%
unite(new, century, year, sep = "")
## # A tibble: 6 x 3
## country new rate
## <chr> <chr> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583