Make sure the tidyverse is loaded:
# Attach the tidyverse meta-package (tibble, readr, dplyr, tidyr, ggplot2, ...)
library(tidyverse)
# a regular data frame
head(iris)
# coerce the data frame to a tibble, keeping a handle so we can convert back
tb <- as_tibble(iris)
# and convert the tibble back to a plain data frame
as.data.frame(tb)
# build a tibble column by column; length-1 inputs are recycled (y),
# and later columns may refer to earlier ones (z uses x and y)
tibble(
x = 1:5,
y = 1,
z = x ^ 2 + y
)
# tribble(): transposed tibble, laid out row by row for small examples
tribble(
~x, ~y, ~z,
#--|--|----
"a", 2, 3.6,
"b", 1, 8.5
)
# tibbles accept any column type, including date-times and dates
tibble(
a = lubridate::now() + runif(1e3) * 86400,
b = lubridate::today() + runif(1e3) * 30,
c = 1:1e3,
d = runif(1e3),
e = sample(letters, 1e3, replace = TRUE) # letters is the built-in constant "a"-"z"
)
# Print the first 10 rows of the flights tibble, showing every column
# (width = Inf disables column truncation); the transcript follows below.
nycflights13::flights %>%
print(n = 10, width = Inf)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 11 UA 1545 N14228 EWR IAH 227 1400 5 15
## 2 20 UA 1714 N24211 LGA IAH 227 1416 5 29
## 3 33 AA 1141 N619AA JFK MIA 160 1089 5 40
## 4 -18 B6 725 N804JB JFK BQN 183 1576 5 45
## 5 -25 DL 461 N668DN LGA ATL 116 762 6 0
## 6 12 UA 1696 N39463 EWR ORD 150 719 5 58
## 7 19 B6 507 N516JB EWR FLL 158 1065 6 0
## 8 -14 EV 5708 N829AS LGA IAD 53 229 6 0
## 9 -8 B6 79 N593JB JFK MCO 140 944 6 0
## 10 8 AA 301 N3ALAA LGA ORD 138 733 6 0
## time_hour
## <dttm>
## 1 2013-01-01 05:00:00
## 2 2013-01-01 05:00:00
## 3 2013-01-01 05:00:00
## 4 2013-01-01 05:00:00
## 5 2013-01-01 06:00:00
## 6 2013-01-01 05:00:00
## 7 2013-01-01 06:00:00
## 8 2013-01-01 06:00:00
## 9 2013-01-01 06:00:00
## 10 2013-01-01 06:00:00
## # … with 336,766 more rows
options(tibble.print_max = n, tibble.print_min = m): if there are more than m rows, print only n rows.
options(dplyr.print_min = Inf): print all rows.
options(tibble.width = Inf): print all columns.
df <- tibble(
x = runif(5),
y = rnorm(5)
)
# Extract a single column by name with $
df$x
## [1] 0.5289031 0.8118254 0.4098623 0.5726840 0.2423287
# ... equivalently with [[ and a name
df[["x"]]
## [1] 0.5289031 0.8118254 0.4098623 0.5726840 0.2423287
# ... or with [[ and a position
df[[1]]
## [1] 0.5289031 0.8118254 0.4098623 0.5726840 0.2423287
To extract a column in a pipe, use the special placeholder `.`:
df %>% .$x
## [1] 0.5289031 0.8118254 0.4098623 0.5726840 0.2423287
df %>% .[["x"]]
## [1] 0.5289031 0.8118254 0.4098623 0.5726840 0.2423287
Let’s start working with our own data. We’ll learn how to read plain-text rectangular files into R.
readr package implements functions that turn flat files into tibbles.
read_csv()
(comma delimited files), read_csv2()
(semicolon separated files), read_tsv()
(tab delimited files), read_delim()
(files with any delimiter).
read_fwf()
(fixed width files), read_table()
(fixed width files where columns are separated by white space).
read_log()
(Apache style log files).
An example file heights.csv
is in this link:
head ./data/heights.csv
## "earn","height","sex","ed","age","race"
## 50000,74.4244387818035,"male",16,45,"white"
## 60000,65.5375428255647,"female",16,58,"white"
## 30000,63.6291977374349,"female",16,29,"white"
## 50000,63.1085616752971,"female",16,91,"other"
## 51000,63.4024835710879,"female",17,39,"white"
## 9000,64.3995075440034,"female",15,26,"white"
## 29000,61.6563258264214,"female",12,49,"white"
## 32000,72.6985437364783,"male",17,46,"white"
## 2000,72.0394668497611,"male",15,21,"hispanic"
# Read the file and print the result (outer parentheses force printing)
(heights <- read_csv("data/heights.csv"))
## Parsed with column specification:
## cols(
## earn = col_double(),
## height = col_double(),
## sex = col_character(),
## ed = col_double(),
## age = col_double(),
## race = col_character()
## )
read_csv("a,b,c
1,2,3
4,5,6")
Skip the first n lines with skip = n:
read_csv("The first line of metadata
The second line of metadata
x,y,z
1,2,3", skip = 2)
read_csv("# A comment I want to skip
x,y,z
1,2,3", comment = "#")
read_csv("1,2,3\n4,5,6", col_names = FALSE)
(`\n` is a convenient shortcut for adding a new line.)
Pass col_names a character vector to supply the column names yourself:
read_csv("1,2,3\n4,5,6", col_names = c("x", "y", "z"))
read_csv("a,b,c\n1,2,.", na = ".")
parse_*()
functions take a character vector and return a more specialised vector like a logical, integer, or date: str(parse_logical(c("TRUE", "FALSE", "NA")))
## logi [1:3] TRUE FALSE NA
str(parse_integer(c("1", "2", "3")))
## int [1:3] 1 2 3
str(parse_date(c("2010-01-01", "1979-10-14")))
## Date[1:2], format: "2010-01-01" "1979-10-14"
parse_integer(c("1", "231", ".", "456"), na = ".")
## [1] 1 231 NA 456
x <- parse_integer(c("123", "345", "abc", "123.45"))
## Warning: 2 parsing failures.
## row col expected actual
## 3 -- an integer abc
## 4 -- no trailing characters .45
x
## [1] 123 345 NA NA
## attr(,"problems")
## # A tibble: 2 x 4
## row col expected actual
## <int> <int> <chr> <chr>
## 1 3 NA an integer abc
## 2 4 NA no trailing characters .45
parse_double()
and parse_number()
.
# some countries use `.` in between the integer and fractional
# parts of a real number, while others use `,`.
parse_double("1.23")
## [1] 1.23
# locale() overrides the default decimal mark for such countries
parse_double("1,23", locale = locale(decimal_mark = ","))
## [1] 1.23
# Numbers are often surrounded by other characters that provide some context
parse_number("$100")
## [1] 100
parse_number("20%")
## [1] 20
parse_number("It cost $123.45")
## [1] 123.45
# “Grouping” characters to make numbers easier to read, like “1,000,000”,
# vary around the world.
# locale(grouping_mark = ...) tells parse_number() which character to ignore
parse_number("$123,456,789") # Used in America
## [1] 123456789
parse_number("123.456.789", locale = locale(grouping_mark = ".")) # Used in many parts of Europe
## [1] 123456789
parse_number("123'456'789", locale = locale(grouping_mark = "'")) # Used in Switzerland
## [1] 123456789
Complications: there are multiple ways to represent the same string.
charToRaw("Hadley")
## [1] 48 61 64 6c 65 79
The mapping from hexadecimal number to character is called the encoding.
Modern strings are almost always encoded in UTF-8, but there still are data produced by older systems that don’t understand UTF-8:
x1 <- "El Ni\xf1o was particularly bad this year" # Latin1 (aka ISO-8859-1)
x2 <- "\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4" # EUC-KR (Korean)
x1
## [1] "El Ni\xf1o was particularly bad this year"
x2
## [1] "\xbeȳ\xe7\xc7\u03fc\xbc\xbf\xe4"
To fix the problem you need to specify the encoding in parse_character()
:
parse_character(x1, locale = locale(encoding = "Latin1"))
## [1] "El Niño was particularly bad this year"
parse_character(x2, locale = locale(encoding = "EUC-KR"))
## [1] "안녕하세요"
R uses factors to represent categorical variables that have a known set of possible values:
# The known set of levels for the factor
fruit <- c("apple", "banana")
# Values outside `levels` produce a parsing failure and become NA
parse_factor(c("apple", "banana", "bananana"), levels = fruit)
## Warning: 1 parsing failure.
## row col expected actual
## 3 -- value in level set bananana
## [1] apple banana <NA>
## attr(,"problems")
## # A tibble: 1 x 4
## row col expected actual
## <int> <int> <chr> <chr>
## 1 3 NA value in level set bananana
## Levels: apple banana
Computer representation of time: 1. date (the number of days since 1970-01-01) 2. date-time (the number of seconds since midnight 1970-01-01) 3. time (the number of seconds since midnight)
parse_datetime()
expects an ISO8601 date-time. ISO8601 is an international standard in which the components of a date are organised from biggest to smallest: year, month, day, hour, minute, second. parse_datetime("2010-10-01T2010")
## [1] "2010-10-01 20:10:00 UTC"
# If time is omitted, it will be set to midnight
parse_datetime("20101010")
## [1] "2010-10-10 UTC"
parse_date()
expects a four digit year, a -
or /
, the month, a -
or /
, then the day: parse_date("2010-10-01")
## [1] "2010-10-01"
parse_time()
expects the hour, :
, minutes, optionally :
and seconds, and an optional am/pm specifier: library(hms)
parse_time("01:10 am")
## 01:10:00
parse_time("20:10:01")
## 20:10:01
Heuristic used by readr: it reads the first 1000 rows and tries each of the following types, stopping when it finds a match:
logical: contains only "F", "T", "FALSE", or "TRUE".
integer: contains only numeric characters (and -).
double: contains only valid doubles (such as 4.5e-5).
number: contains valid doubles with the grouping mark inside.
time: matches the default time_format.
date: matches the default date_format.
date-time: any ISO8601 date.
If none of these rules apply, then the column will stay as a vector of strings.
First 1000 rows are not enough.
Missing values.
challenge <- read_csv(readr_example("challenge.csv"))
## Parsed with column specification:
## cols(
## x = col_double(),
## y = col_logical()
## )
## Warning: 1000 parsing failures.
## row col expected actual file
## 1001 y 1/0/T/F/TRUE/FALSE 2015-01-16 '/Library/Frameworks/R.framework/Versions/4.0/Resources/library/readr/extdata/challenge.csv'
## 1002 y 1/0/T/F/TRUE/FALSE 2018-05-18 '/Library/Frameworks/R.framework/Versions/4.0/Resources/library/readr/extdata/challenge.csv'
## 1003 y 1/0/T/F/TRUE/FALSE 2015-09-05 '/Library/Frameworks/R.framework/Versions/4.0/Resources/library/readr/extdata/challenge.csv'
## 1004 y 1/0/T/F/TRUE/FALSE 2012-11-28 '/Library/Frameworks/R.framework/Versions/4.0/Resources/library/readr/extdata/challenge.csv'
## 1005 y 1/0/T/F/TRUE/FALSE 2020-01-13 '/Library/Frameworks/R.framework/Versions/4.0/Resources/library/readr/extdata/challenge.csv'
## .... ... .................. .......... ............................................................................................
## See problems(...) for more details.
problems(challenge)
A good strategy is to work column by column until there are no problems remaining.
# Re-read, overriding the guessed types: keep x numeric, but read y as
# character so no values are lost while we figure out what it holds
challenge <- read_csv(
readr_example("challenge.csv"),
col_types = cols(
x = col_double(),
y = col_character()
)
)
y appears to contain dates stored in a character vector; inspect the end of the data: tail(challenge)
# y really holds ISO dates, so parse it directly with col_date()
challenge <- read_csv(
readr_example("challenge.csv"),
col_types = cols(
x = col_double(),
y = col_date()
)
)
# Check the last rows, where the earlier parsing failures occurred
tail(challenge)
challenge2 <- read_csv(readr_example("challenge.csv"), guess_max = 1001)
## Parsed with column specification:
## cols(
## x = col_double(),
## y = col_date(format = "")
## )
challenge2
challenge2 <- read_csv(readr_example("challenge.csv"),
col_types = cols(.default = col_character())
)
and apply `type_convert()`:
# All columns entered as character strings; type_convert() re-runs the
# readr parsing heuristic on each character column
df <- tribble(
~x, ~y,
"1", "1.21",
"2", "2.32",
"3", "4.56"
)
df
# Note the column types
type_convert(df)
## Parsed with column specification:
## cols(
## x = col_double(),
## y = col_double()
## )
write_csv(challenge, "challenge.csv")
write_rds(challenge, "challenge.rds")
read_rds("challenge.rds")
haven reads SPSS, Stata, and SAS files.
readxl reads excel files (both .xls and .xlsx).
DBI, along with a database specific backend (e.g. RMySQL, RSQLite, RPostgreSQL etc) allows you to run SQL queries against a database and return a data frame.
jsonlite reads JSON files; JSON is a standard data exchange format on the web.
xml2 reads XML files.
There are three interrelated rules which make a dataset tidy:
Each variable must have its own column.
Each observation must have its own row.
Each value must have its own cell.
table1
is tidy.
table2
is not tidy.
table3
is not tidy.
table4a
is not tidy.
table4b
is not tidy.
dplyr, ggplot2, and all the other packages in the tidyverse are designed to work with tidy data.
# Compute rate per 10,000
table1 %>%
mutate(rate = cases / population * 10000)
# Compute cases per year
table1 %>%
count(year, wt = cases)
# Visualise changes over time
ggplot(table1, aes(year, cases)) +
geom_line(aes(group = country), colour = "grey50") +
geom_point(aes(colour = country))
Most data that you will encounter will be untidy.
One variable might be spread across multiple columns.
One observation might be scattered across multiple rows.
One variable might be spread across multiple columns.
gather()
columns into a new pair of variables. table4a %>%
gather(`1999`, `2000`, key = "year", value = "cases")
+ __Key__ = name of the variable whose values form the column names
+ __Value__ = name of the variable whose values are spread over the cells
# NOTE(review): gather() is superseded by tidyr::pivot_longer(); kept here
# so the code matches the transcript output below.
tidy4a <- table4a %>%
gather(`1999`, `2000`, key = "year", value = "cases")
tidy4b <- table4b %>%
gather(`1999`, `2000`, key = "year", value = "population")
left_join(tidy4a, tidy4b) # we'll learn about this in Ch. 13
## Joining, by = c("country", "year")
One observation might be scattered across multiple rows.
spread(table2, key = type, value = count)
table3
table3 %>%
separate(rate, into = c("cases", "population"))
table3 %>%
separate(rate, into = c("cases", "population"), convert = TRUE)
table3 %>%
separate(year, into = c("century", "year"), sep = 2)
table5
unite()
is the inverse of separate()
. table5 %>%
unite(new, century, year, sep = "")