安裝、載入一些基本的套件

# Attach every package this script uses, installing any that are missing.
# A bare `if (!require(pkg)) install.packages(pkg)` installs a missing package
# but never attaches it, so the first run on a fresh machine would fail later.
# Here availability is checked with requireNamespace() and library() is always
# called, so a freshly installed package is attached in the same session.
# library() also stops with an error if the installation did not succeed.
invisible(lapply(
  c("dplyr", "tidyr", "ggplot2", "plotly", "babynames"),
  function(pkg) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
    library(pkg, character.only = TRUE)
  }
))

【Chapter-4】Case Study: US Babynames

檢視資料

# Auto-print the babynames tibble to inspect its columns and first rows.
babynames
# A tibble: 1,924,665 x 5
    year sex   name          n   prop
   <dbl> <chr> <chr>     <int>  <dbl>
 1  1880 F     Mary       7065 0.0724
 2  1880 F     Anna       2604 0.0267
 3  1880 F     Emma       2003 0.0205
 4  1880 F     Elizabeth  1939 0.0199
 5  1880 F     Minnie     1746 0.0179
 6  1880 F     Margaret   1578 0.0162
 7  1880 F     Ida        1472 0.0151
 8  1880 F     Alice      1414 0.0145
 9  1880 F     Bertha     1320 0.0135
10  1880 F     Sarah      1288 0.0132
# ... with 1,924,655 more rows


篩選出男生資料

# Male records only, restricted to every fifth year starting at 1880
# (1880, 1885, ..., 2015); the count column `n` is renamed to `number`.
mbaby <- babynames %>%
  filter(sex == "M", year %in% seq(from = 1880, to = 2017, by = 5)) %>%
  rename(number = n)


某些男生名字的『數量』

# Line chart of the yearly count (`number`) for three selected male names,
# one coloured line per name.
# ggplot() takes the tibble directly and geom_line() connects points in order
# of the x variable, so no prior sorting or data.frame conversion is needed.
mbaby %>%
  filter(name %in% c("Steven", "Thomas", "Matthew")) %>%
  ggplot(aes(x = year, y = number, col = name)) +
  geom_line()


某些男生名字的『比例』

# Explicit version: for each year, store the total count over all rows of
# `mbaby` in `year_total`, then express each name's count as a fraction of it.
# Only the three selected names are plotted, one coloured line per name.
mbaby %>%
  group_by(year) %>%
  mutate(
    year_total = sum(number),
    fraction = number / year_total
  ) %>%
  ungroup() %>%
  filter(name %in% c("Steven", "Thomas", "Matthew")) %>%
  ggplot(aes(x = year, y = fraction, colour = name)) +
  geom_line()


簡化程式

# Compact version: divide each count by its year's total inline, without
# keeping the total as a separate column, then plot the three selected names.
mbaby %>%
  group_by(year) %>%
  mutate(fraction = number / sum(number)) %>%
  filter(name %in% c("Steven", "Thomas", "Matthew")) %>%
  ggplot(mapping = aes(year, fraction, colour = name)) +
  geom_line()


熱門男生名字的『比例』

# Distinct male names that had the highest count in at least one of the
# years present in `mbaby` (top_n() keeps ties).
mtop <- mbaby %>%
  group_by(year) %>%
  top_n(1, number) %>%
  pull(name) %>%
  unique()

# Per-year fraction of each name in `mtop`, drawn as semi-transparent lines
# with small points, then rendered as an interactive plotly widget.
g <- mbaby %>%
  group_by(year) %>%
  mutate(fraction = number / sum(number)) %>%
  filter(name %in% mtop) %>%
  ggplot(aes(x = year, y = fraction, colour = name)) +
  geom_line(alpha = 0.5) +
  geom_point(size = 0.5)
ggplotly(g)


熱門女生名字的『比例』

# Female records only, restricted to every fifth year starting at 1880
# (1880, 1885, ..., 2015); the count column `n` is renamed to `number`.
fbaby <- babynames %>%
  filter(sex == "F", year %in% seq(from = 1880, to = 2017, by = 5)) %>%
  rename(number = n)

# Distinct female names that had the highest count in at least one of the
# years present in `fbaby` (top_n() keeps ties).
ftop <- fbaby %>%
  group_by(year) %>%
  top_n(1, number) %>%
  pull(name) %>%
  unique()

# Per-year fraction of each name in `ftop`, drawn as semi-transparent lines
# with small points, then rendered as an interactive plotly widget.
g <- fbaby %>%
  group_by(year) %>%
  mutate(fraction = number / sum(number)) %>%
  filter(name %in% ftop) %>%
  ggplot(aes(x = year, y = fraction, colour = name)) +
  geom_line(alpha = 0.5) +
  geom_point(size = 0.5)

ggplotly(g)



💡 學習重點:
  ■ 每一份資料都是一個物件:名稱、值、種類、結構
  ■ 每一行程式都是一個運算式:物件、運算符號與功能的組合
  ■ 所謂寫程式:使用運算式定義新物件或改變既有物件的過程
  ■ dplyr套件:提供一套方便『整理資料』的工具,包括:
    § 物件:tibble
    § 運算符號:%>%
    § 功能:
      。select: 依名稱選擇欄位
      。filter: 依條件選取紀錄
      。mutate: 運算新欄位
      。summarise: 欄位統計
      。group_by: 資料分群
      。…