suppressPackageStartupMessages({
  library(tidyverse)
  library(scales)
  library(glue)
  library(sf)
  library(albersusa)
})

1 Data

The raw data are taken from the New York Times COVID-19 GitHub repository.

# Download the state-level CSV from the New York Times repository.
# `col_types = cols()` lets readr guess each column's type without printing
# the column specification.
df <- read_csv(
  file = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv",
  col_types = cols()
)

2 Data Analysis

Plot the number of cases for California as a function of time (date)

# Line chart of the `cases` column for California against `date`, drawn on a
# log10 y-axis.
ggplot(
  data = filter(df, state == "California"),
  mapping = aes(x = date, y = cases)
) +
  geom_line(color = "tomato") +
  scale_x_date(breaks = pretty_breaks(8)) +
  # Log scale with comma-separated, whole-number labels. Linear alternative:
  # scale_y_continuous(labels = comma, breaks = pretty_breaks(8))
  scale_y_log10(
    labels = comma_format(accuracy = 1),
    breaks = breaks_log(8)
  ) +
  labs(title = "Total number of cases in California", x = "") +
  theme_bw()

Find the 6 states with the highest number of cases


# Number of states kept in the ranking table and in the comparison plots.
n_states <- 6

# Rank states by their case count on the most recent date in the data and keep
# the `n_states` largest (top_n() retains ties), ordered from most to fewest
# cases. A death-to-case ratio is added as a formatted percentage string.
# An earlier variant ranked states by max(cases) per state instead of
# filtering to the latest date.
top_states <-
  df %>%
  filter(date == max(date)) %>%
  top_n(n = n_states, wt = cases) %>%
  arrange(desc(cases)) %>%
  transmute(
    state,
    cases,
    deaths,
    `Death to Cases Ratio` = percent(deaths / cases)
  )

# Print the ranking as a pandoc-formatted table with a caption that reflects
# the current value of `n_states`.
knitr::kable(
  top_states,
  format = "pandoc",
  caption = glue("Top {n_states} states with the largest number of cases")
)
Top 6 states with the largest number of cases

| state         |  cases | deaths | Death to Cases Ratio |
|:--------------|-------:|-------:|:---------------------|
| New York      | 367625 |  29138 | 7.926%               |
| New Jersey    | 155092 |  11144 | 7.185%               |
| Illinois      | 112248 |   4912 | 4.376%               |
| California    |  97017 |   3808 | 3.925%               |
| Massachusetts |  93271 |   6416 | 6.879%               |
| Pennsylvania  |  72356 |   5159 | 7.130%               |

Plot the number of cases over time for the 6 states found above, with each state drawn in a different color

# One line per state in `top_states`, colored by state, showing `cases`
# against `date` on a log10 y-axis.
ggplot(
  data = filter(df, state %in% top_states$state),
  mapping = aes(x = date, y = cases, color = state)
) +
  geom_line() +
  scale_x_date(breaks = pretty_breaks(8)) +
  # Log scale with comma-separated, whole-number labels. Linear alternative:
  # scale_y_continuous(labels = comma, breaks = pretty_breaks(8))
  scale_y_log10(
    labels = comma_format(accuracy = 1),
    breaks = breaks_log(8)
  ) +
  labs(
    title = glue("Total number of cases in top {n_states} states"),
    x = ""
  ) +
  theme_bw()

# Small multiples: one filled area chart of `cases` against `date` per state in
# `top_states`. Panels are ordered by each state's maximum case count
# (largest first) and each panel gets its own y-axis range.
ggplot(
  data = filter(df, state %in% top_states$state),
  mapping = aes(x = date, y = cases)
) +
  geom_area(aes(fill = state), alpha = 0.5, show.legend = FALSE) +
  facet_wrap(
    ~ fct_reorder(state, cases, .fun = max, .desc = TRUE),
    scales = "free_y"
  ) +
  scale_x_date(breaks = pretty_breaks(5)) +
  # Linear y-axis here. Log alternative:
  # scale_y_log10(labels = comma_format(accuracy = 1), breaks = breaks_log(8))
  scale_y_continuous(labels = comma, breaks = pretty_breaks(5)) +
  labs(
    title = glue("Total number of cases in top {n_states} states"),
    x = ""
  ) +
  theme_bw()

# For each state in `top_states`, plot the increase in `cases` over the
# previous 7 rows (7 days, assuming one row per state per day -- TODO confirm)
# against the running total. Rows without a value 7 rows back are dropped.
df %>%
  filter(state %in% top_states$state) %>%
  # Sort by date first; lag() below then works within each state's rows.
  arrange(date) %>%
  group_by(state) %>%
  mutate(last_week_cases = cases - lag(cases, n = 7)) %>%
  drop_na(last_week_cases) %>%
  ggplot(mapping = aes(x = cases, y = last_week_cases, color = state)) +
  geom_area(aes(fill = state), alpha = 0.4, show.legend = FALSE) +
  facet_wrap(
    ~ fct_reorder(state, cases, .fun = max, .desc = TRUE),
    scales = "free"
  ) +
  scale_x_continuous(labels = comma, breaks = pretty_breaks(3)) +
  # Linear y-axis here. Log alternative:
  # scale_y_log10(labels = comma_format(accuracy = 1), breaks = breaks_log(8))
  scale_y_continuous(labels = comma, breaks = pretty_breaks(3)) +
  labs(
    title = glue("Number of cases in top {n_states} states last week vs total"),
    x = "Total Number of Cases",
    y = "Number of cases last week"
  ) +
  theme_bw()

# Rows from the most recent date in the data, reduced to the columns needed
# for the map (date, state name, FIPS code, case count).
df_last <- select(
  filter(df, date == max(date)),
  date, state, fips, cases
)

# State geometries from albersusa, returned as an sf object.
usa_data <- usa_sf()

# Compute each state's centroid exactly once; both label coordinates below are
# read from this list instead of calling st_centroid() twice per geometry.
state_centroids <- map(st_geometry(usa_data), st_centroid)

# Choropleth of log10(cases) per state on the latest date, with each state's
# ISO 3166-2 code drawn at its centroid.
usa_data %>%
  mutate(
    long = map_dbl(state_centroids, ~ .x[1]),
    lat = map_dbl(state_centroids, ~ .x[2])
  ) %>%
  # Match on both the FIPS code and the state name.
  left_join(df_last, by = c("fips_state" = "fips", "name" = "state")) %>%
  ggplot() +
  geom_sf(aes(fill = log10(cases))) +
  scale_fill_gradient(low = "mediumblue", high = "yellow") +
  geom_text(
    aes(x = long, y = lat, label = iso_3166_2),
    color = "black",
    size = 3
  ) +
  theme_void() +
  labs(
    title = glue("Total Covid-19 cases in United States as of {max(df_last$date)}")
  )