dplyr
安裝、載入一些基本的套件
# Install (only when missing) and then attach every package used below.
# The previous `if (!require(pkg)) install.packages(pkg)` pattern installed a
# missing package but never attached it, so a first run on a fresh machine
# failed at the first `%>%`. Here `library()` is always called after the
# optional install, and it raises an error if the package is still unavailable.
# Packages are attached in the same order as before, so masking is unchanged.
invisible(lapply(
  c("dplyr", "tidyr", "ggplot2", "plotly", "babynames"),
  function(pkg) {
    # requireNamespace() only checks availability; it does not attach.
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
    library(pkg, character.only = TRUE)
  }
))
檢視資料
babynames
# A tibble: 1,924,665 x 5
year sex name n prop
<dbl> <chr> <chr> <int> <dbl>
1 1880 F Mary 7065 0.0724
2 1880 F Anna 2604 0.0267
3 1880 F Emma 2003 0.0205
4 1880 F Elizabeth 1939 0.0199
5 1880 F Minnie 1746 0.0179
6 1880 F Margaret 1578 0.0162
7 1880 F Ida 1472 0.0151
8 1880 F Alice 1414 0.0145
9 1880 F Bertha 1320 0.0135
10 1880 F Sarah 1288 0.0132
# ... with 1,924,655 more rows
篩選出男生資料
# Male rows of `babynames`, restricted to every fifth year starting at 1880,
# with the count column `n` exposed under the name `number`.
mbaby <- babynames %>%
  filter(sex == "M", year %in% seq(from = 1880, to = 2017, by = 5)) %>%
  rename(number = n)
某些男生名字的『數量』
# Line chart of the raw counts (`number`) over `year` for three male names,
# one coloured line per name. The rows are sorted by name and year and
# converted to a plain data.frame before being handed to ggplot().
ggplot(
  data = mbaby %>%
    filter(name %in% c("Steven", "Thomas", "Matthew")) %>%
    arrange(name, year) %>%
    data.frame(),
  mapping = aes(x = year, y = number, colour = name)
) +
  geom_line()
某些男生名字的『比例』
# Line chart of each selected name's share of all `mbaby` counts in its year.
# `year_total` is computed per year over every row of `mbaby` before the name
# filter is applied, so the denominator is not limited to the plotted names.
mbaby %>%
  group_by(year) %>%
  mutate(year_total = sum(number)) %>%
  ungroup() %>%
  mutate(fraction = number / year_total) %>%
  filter(name %in% c("Steven", "Thomas", "Matthew")) %>%
  ggplot(mapping = aes(x = year, y = fraction, colour = name)) +
  geom_line()
簡化程式
# Compact variant of the share plot: the yearly share is computed in a single
# grouped mutate(), without an intermediate total column. The data is still
# grouped by `year` when it reaches ggplot().
mbaby %>%
  group_by(year) %>%
  mutate(fraction = number / sum(number)) %>%
  filter(name %in% c("Steven", "Thomas", "Matthew")) %>%
  ggplot(mapping = aes(x = year, y = fraction, colour = name)) +
  geom_line()
熱門男生名字的『比例』
# Distinct names that had the highest `number` in at least one year of `mbaby`.
mtop <- mbaby %>%
  group_by(year) %>%
  top_n(n = 1, wt = number) %>%
  pull(name) %>%
  unique()

# Yearly share of each of those names, drawn as semi-transparent lines with
# small points, stored in `g`.
g <- mbaby %>%
  group_by(year) %>%
  mutate(fraction = number / sum(number)) %>%
  filter(name %in% mtop) %>%
  ggplot(mapping = aes(x = year, y = fraction, colour = name)) +
  geom_line(alpha = 0.5) +
  geom_point(size = 0.5)

# Render the ggplot object as an interactive plotly widget.
ggplotly(g)
熱門女生名字的『比例』
# Female rows of `babynames`, restricted to every fifth year starting at 1880,
# with the count column `n` exposed under the name `number`.
fbaby <- babynames %>%
  filter(sex == "F", year %in% seq(from = 1880, to = 2017, by = 5)) %>%
  rename(number = n)

# Distinct names that had the highest `number` in at least one year of `fbaby`.
ftop <- fbaby %>%
  group_by(year) %>%
  top_n(n = 1, wt = number) %>%
  pull(name) %>%
  unique()

# Yearly share of each of those names, drawn as semi-transparent lines with
# small points, stored in `g` (this overwrites any earlier `g`).
g <- fbaby %>%
  group_by(year) %>%
  mutate(fraction = number / sum(number)) %>%
  filter(name %in% ftop) %>%
  ggplot(mapping = aes(x = year, y = fraction, colour = name)) +
  geom_line(alpha = 0.5) +
  geom_point(size = 0.5)

# Render the ggplot object as an interactive plotly widget.
ggplotly(g)
💡 學習重點:
■ 每一份資料都是一個
■ 每一行程式都是一個
■ 所謂寫
■ dplyr
§ 物件:tibble
§ 運算符號:%>%
§ 功能:
。select: 依名稱選擇欄位
。filter: 依條件選取紀錄
。mutate: 運算新欄位
。summarise: 欄位統計
。group_by: 資料分群
。…