The exercises are part of the Fundamentals of R course. For more, see the R for the Rest of Us website.
Let’s load the packages we need. These include tidyverse (especially the dplyr package) and janitor.
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.1 ✔ dplyr 0.8.0.1
## ✔ tidyr 0.8.3.9000 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
Import your data into a data frame called nhanes. Use clean_names to make your variable names easy to work with.
# Import the NHANES data and convert the column names to snake_case.
# readr guesses TVHrsDay as logical from the rows it samples, and later
# values such as "2_hr" then fail to parse and become NA. Declaring the
# column as character keeps those values; all other columns are still guessed.
nhanes <- read_csv(
  "data/nhanes.csv",
  col_types = cols(TVHrsDay = col_character())
) %>%
  clean_names()
## Parsed with column specification:
## cols(
## .default = col_character(),
## ID = col_double(),
## Age = col_double(),
## Weight = col_double(),
## Height = col_double(),
## BMI = col_double(),
## DaysPhysHlthBad = col_double(),
## DaysMentHlthBad = col_double(),
## SleepHrsNight = col_double(),
## PhysActiveDays = col_double(),
## TVHrsDay = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 4859 parsing failures.
## row col expected actual file
## 5001 TVHrsDay 1/0/T/F/TRUE/FALSE 2_hr 'data/nhanes.csv'
## 5002 TVHrsDay 1/0/T/F/TRUE/FALSE More_4_hr 'data/nhanes.csv'
## 5003 TVHrsDay 1/0/T/F/TRUE/FALSE 4_hr 'data/nhanes.csv'
## 5004 TVHrsDay 1/0/T/F/TRUE/FALSE 4_hr 'data/nhanes.csv'
## 5005 TVHrsDay 1/0/T/F/TRUE/FALSE 1_hr 'data/nhanes.csv'
## .... ........ .................. ......... .................
## See problems(...) for more details.
Make a scatterplot that shows weight on the x axis and height on the y axis.
# Scatterplot with weight on the x axis and height on the y axis,
# one point per row of nhanes.
ggplot(nhanes, aes(x = weight, y = height)) +
  geom_point()
## Warning: Removed 366 rows containing missing values (geom_point).
Make a histogram that shows the distribution of the weight variable.
# Histogram of the weight variable using the default binning.
ggplot(nhanes, aes(x = weight)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 78 rows containing non-finite values (stat_bin).
Copy your code from above, but adjust it so that there are 50 bins.
# Same histogram of weight, but split into 50 bins.
ggplot(nhanes, aes(x = weight)) +
  geom_histogram(bins = 50)
## Warning: Removed 78 rows containing non-finite values (stat_bin).
Use the v1 approach to make a bar chart that shows a count of the number of people who say they smoke. Include NA responses.
# Bar chart counting the rows for each value of smoke_now.
ggplot(nhanes, aes(x = smoke_now)) +
  geom_bar()
Create a new data frame called sleep_by_gender
that shows the average amount of sleep per night that males and females report getting. Drop any NA (or NaN) responses from this data frame.
# Average reported hours of sleep per night for each gender.
# na.rm = TRUE ignores missing sleep values inside each group; drop_na()
# then removes any summary row that still contains NA/NaN (for example a
# missing gender, or a group with no sleep values at all), as the
# exercise asks.
sleep_by_gender <- nhanes %>%
  group_by(gender) %>%
  summarize(avg_sleep = mean(sleep_hrs_night, na.rm = TRUE)) %>%
  drop_na()
Plot the average amount of sleep per night for males and females.
# Bar chart of the precomputed averages. stat = "identity" makes
# geom_bar use avg_sleep as the bar height instead of counting rows.
ggplot(sleep_by_gender, aes(x = gender, y = avg_sleep)) +
  geom_bar(stat = "identity")
Make the same graph as above, but use geom_col instead of geom_bar.
# Same chart as before, drawn with geom_col, which takes the bar
# heights directly from the y aesthetic.
ggplot(sleep_by_gender, aes(x = gender, y = avg_sleep)) +
  geom_col()
color
and fill
Take your graph from above (the one with geom_col) and make the inside of each bar a different color.
# Column chart of average sleep with each bar filled by gender.
ggplot(
  sleep_by_gender,
  aes(x = gender, y = avg_sleep, fill = gender)
) +
  geom_col()
Make your scatterplot from before that shows weight on the x axis and height on the y axis again, but make the dots show up in different colors based on the phys_active variable.
# Weight vs. height scatterplot with points colored by phys_active.
ggplot(
  nhanes,
  aes(x = weight, y = height, color = phys_active)
) +
  geom_point()
## Warning: Removed 366 rows containing missing values (geom_point).
Take your scatterplot that you just made and add a scale using scale_color_brewer. Take a look at the help docs and choose a palette other than the default (hint: look at the palette argument).
# Same colored scatterplot, with the point colors taken from the
# ColorBrewer "Dark2" palette.
ggplot(
  nhanes,
  aes(x = weight, y = height, color = phys_active)
) +
  geom_point() +
  scale_color_brewer(palette = "Dark2")
## Warning: Removed 1750 rows containing missing values (geom_point).
Do nearly the same thing to change the color of the last bar chart you made (the one about sleep and gender).
# Sleep-by-gender column chart, with the bar fills taken from the
# ColorBrewer "Dark2" palette.
ggplot(
  sleep_by_gender,
  aes(x = gender, y = avg_sleep, fill = gender)
) +
  geom_col() +
  scale_fill_brewer(palette = "Dark2")
Copy the graph you just made and change the y axis so it goes from 0 to 8.
# Sleep-by-gender column chart with the y axis fixed to run from 0 to 8.
ggplot(
  sleep_by_gender,
  aes(x = gender, y = avg_sleep, fill = gender)
) +
  geom_col() +
  scale_fill_brewer(palette = "Dark2") +
  scale_y_continuous(limits = c(0, 8))
Copy the last code chunk. Then adjust the breaks on the y axis so that it shows 0 to 8 by 1 (i.e. 0, 1, 2, etc).
# Sleep-by-gender column chart; y axis runs 0 to 8 with a break at
# every whole hour.
ggplot(
  sleep_by_gender,
  aes(x = gender, y = avg_sleep, fill = gender)
) +
  geom_col() +
  scale_fill_brewer(palette = "Dark2") +
  scale_y_continuous(
    limits = c(0, 8),
    breaks = seq(0, 8, by = 1)
  )
Copy your last code chunk. Then do the following:
Add text labels to the bars with geom_text. Use the round function to show just one decimal place in each label. Then use the vjust
argument to have them show up at the inner edge of the bars.
# Column chart with each bar's average, rounded to one decimal place,
# printed in white. vjust = 1.5 pushes the text below the top of the
# bar, i.e. inside it.
ggplot(
  sleep_by_gender,
  aes(x = gender, y = avg_sleep, fill = gender)
) +
  geom_col() +
  geom_text(
    aes(label = round(avg_sleep, 1)),
    vjust = 1.5,
    color = "white"
  ) +
  scale_fill_brewer(palette = "Dark2", na.value = "blue") +
  scale_y_continuous(
    limits = c(0, 8),
    breaks = seq(0, 8, by = 1)
  )
Do the same thing as above, but use geom_label instead of geom_text. Also, do the following:
vjust
argument to have them show up at the outer edge of the bars.
# Same chart with boxed labels (geom_label) instead of plain text.
# A negative vjust lifts the label above the top of the bar, and
# show.legend = FALSE on both layers hides the legend.
ggplot(
  sleep_by_gender,
  aes(x = gender, y = avg_sleep, fill = gender)
) +
  geom_col(show.legend = FALSE) +
  geom_label(
    aes(label = round(avg_sleep, 1)),
    vjust = -1.1,
    show.legend = FALSE,
    color = "white"
  ) +
  scale_fill_brewer(palette = "Dark2", na.value = "blue") +
  scale_y_continuous(
    limits = c(0, 8),
    breaks = seq(0, 8, by = 1)
  )
Use the code chunk from above with geom_text (not the last one with geom_label). Do the following:
show.legend
argument again)
# geom_text version of the chart with the legend hidden, a title,
# a y-axis label, and an empty x-axis label.
ggplot(
  sleep_by_gender,
  aes(x = gender, y = avg_sleep, fill = gender)
) +
  geom_col(show.legend = FALSE) +
  geom_text(
    aes(label = round(avg_sleep, 1)),
    vjust = 1.5,
    color = "white"
  ) +
  scale_fill_brewer(palette = "Dark2", na.value = "blue") +
  scale_y_continuous(
    limits = c(0, 8),
    breaks = seq(0, 8, by = 1)
  ) +
  labs(
    title = "Women sleep slightly more than men on average",
    y = "Hours of sleep per night",
    x = ""
  )
Install and load the hrbrthemes
package. It’s a package that provides some great default themes.
It’s not available on CRAN, where we normally install packages from, which means you have to install it slightly differently. You’ll use the devtools
package and then use this to install the hrbrthemes
package. Use the code below.
# install.packages("devtools")
# devtools::install_github("hrbrmstr/hrbrthemes")
library(hrbrthemes)
Then add theme_ipsum to your plot.
# Labelled sleep-by-gender chart with theme_ipsum() from hrbrthemes
# applied as the final layer.
ggplot(
  sleep_by_gender,
  aes(x = gender, y = avg_sleep, fill = gender)
) +
  geom_col(show.legend = FALSE) +
  geom_text(
    aes(label = round(avg_sleep, 1)),
    vjust = 1.5,
    color = "white"
  ) +
  scale_fill_brewer(palette = "Dark2", na.value = "blue") +
  scale_y_continuous(
    limits = c(0, 8),
    breaks = seq(0, 8, by = 1)
  ) +
  labs(
    title = "Women sleep slightly more than men on average",
    y = "Hours of sleep per night",
    x = ""
  ) +
  theme_ipsum()
I’ve created a data frame called sleep_by_gender_by_age
for you. Run the code chunk below to load the data frame.
# Average reported hours of sleep per night for every combination of
# gender and age decade, with rows containing NA/NaN removed.
# summarize() only peels off the last grouping variable, so the result
# would otherwise stay grouped by gender; ungroup() returns a plain
# tibble so later verbs are not silently applied per gender.
sleep_by_gender_by_age <- nhanes %>%
  group_by(gender, age_decade) %>%
  summarize(avg_sleep = mean(sleep_hrs_night, na.rm = TRUE)) %>%
  ungroup() %>%
  drop_na()
Let’s take a look at sleep_by_gender_by_age
.
# Print the summary table: one row per gender / age_decade combination
# with its avg_sleep value.
sleep_by_gender_by_age
Now, see if you can recreate this plot. Much of the code will be the same from your previous plots using the sleep_by_gender
data frame so just make some small changes.
# Average sleep by age decade, with one facet per gender. Bars are
# filled by gender, labelled in white with the rounded average, and the
# legend is hidden.
ggplot(
  data = sleep_by_gender_by_age,
  mapping = aes(x = age_decade, y = avg_sleep, fill = gender)
) +
  geom_col(show.legend = FALSE) +
  geom_text(
    mapping = aes(label = round(avg_sleep, 1)),
    vjust = 1.5,
    color = "white"
  ) +
  scale_y_continuous(
    limits = c(0, 8),
    breaks = seq(from = 0, to = 8, by = 1)
  ) +
  scale_fill_brewer(palette = "Dark2") +
  facet_wrap(~ gender) +
  theme_ipsum() +
  labs(
    title = "Sleep by gender and age",
    y = "Hours of sleep per night",
    x = ""
  )
Save your last plot to a PNG that is 8 inches wide and 5 inches high. Put it in the plots directory and call it “my-sleep-plot.png”.
# Save the most recently displayed plot (ggsave's default when no plot
# is given) as an 8 x 5 inch PNG. The argument is spelled `units` in
# full rather than relying on R's partial matching of `unit`.
# The "plots" directory must already exist.
ggsave(
  filename = "plots/my-sleep-plot.png",
  width = 8,
  height = 5,
  units = "in"
)