# SS & Sequence SQ IDs
# Load data/TBN_CUST_BEHAVIOR.csv. The column types that readr used to guess
# (and report as "Parsed with column specification") are stated explicitly, so
# the import no longer depends on type guessing and prints no message.
B <- read_csv(
  "data/TBN_CUST_BEHAVIOR.csv",
  col_types = cols(
    CUST_NO = col_character(),
    VISITDATE = col_integer(),
    PAGE = col_character()
  )
) %>% data.frame
# rid: position of each row as loaded, recorded before the table is re-sorted.
# seq_len() is used because 1:nrow(B) would yield c(1, 0) for a zero-row frame.
B$rid = seq_len(nrow(B))
B = B %>% arrange(VISITDATE, CUST_NO)
# SS: one integer id per (VISITDATE, CUST_NO) combination. Grouping first and
# then asking for the indices avoids the deprecated group_indices(.data, ...)
# form.
B$SS = B %>% group_by(VISITDATE, CUST_NO) %>% group_indices()
# SQ: position of each row inside its SS group, in the current row order.
B = group_by(B, SS) %>% mutate(SQ = row_number()) %>% data.frame
# Print the rows of one group as a spot check.
subset(B, SS == 7)
#>    CUST_NO VISITDATE
9 -2HNCB-ABGPPLFIW 9448
10 -2HNCB-ABGPPLFIW 9448
11 -2HNCB-ABGPPLFIW 9448
12 -2HNCB-ABGPPLFIW 9448
13 -2HNCB-ABGPPLFIW 9448
14 -2HNCB-ABGPPLFIW 9448
15 -2HNCB-ABGPPLFIW 9448
16 -2HNCB-ABGPPLFIW 9448
PAGE
9 https://www.esunbank.com.tw/gygrt/e2c/ge/
10 https://www.esunbank.com.tw/edrn/pgusordq/fgposkt/udtg/iougz/gzchdrjg-udtg-chdut
11 https://www.esunbank.com.tw/edrn/pgusordq/fgposkt/udtg/iougz/cxuugrca-corygutgu
12 https://www.esunbank.com.tw/edrn/pgusordq/fgposkt/udtg/iougz/cxuugrca-corygutgu
13 https://www.esunbank.com.tw/edrn/deoxt/drroxrcgmgrt/drroxrcgmgrt
14 https://www.esunbank.com.tw/edrn/pgusordq/fgposkt/udtg/twf/fgposkt-cdqcxqdtkor
15 https://www.esunbank.com.tw/edrn/pgusordq/fgposkt/iougkjr-sguykcg/fgopskt-gygrt
16 https://www.esunbank.com.tw/gygrt/iougz/mdungtkrj/puomotkor/
rid SS SQ
9 408098 7 1
10 408113 7 2
11 408121 7 3
12 408124 7 4
13 408126 7 5
14 408127 7 6
15 408130 7 7
16 408131 7 8
# Count how often each PAGE value occurs, most frequent first, as a data frame
# with columns Var1 (the page) and Freq (the count).
pg = table(B$PAGE) %>% sort(decreasing = TRUE) %>% as.data.frame
# Show the six most frequent and the six least frequent pages together.
rbind(head(pg), tail(pg))
#>    Var1
1 https://www.esunbank.com.tw/edrn/pgusordq/fgposkt/udtg/iougz/iougkjr-gzchdrjg-udtgs
2 https://www.esunbank.com.tw/edrn/pgusordq/fgposkt/udtg/iougz/gzchdrjg-udtg-chdut
3 https://www.esunbank.com.tw/edrn/pgusordq
4 https://www.esunbank.com.tw/edrn/pgusordq/cugfkt-cduf/ugwduf/ugsxqt
5 https://www.esunbank.com.tw/edrn/pgusordq/cugfkt-cduf
6 https://www.esunbank.com.tw/edrn/pgusordq/cugfkt-cduf/fkscoxrt/shopkrio
777 https://www.esunbank.com.tw/gygrt/wgdqth/gpdpgu/20180108/
778 https://www.esunbank.com.tw/gygrt/wgdqth/gpdpgu/20180312/
779 https://www.esunbank.com.tw/gygrt/wgdqth/gpdpgu/20180416/
780 https://www.esunbank.com.tw/gygrt/wgdqth/mougqkcdqq_ixrf/
781 https://www.esunbank.com.tw/gygrt/wgedtm/pqxs/mdkr.htm
782 https://www.esunbank.com.tw/gygrt/wgedtm/tdoedo/kfgdqkig/
Freq
1 522355
2 363036
3 286790
4 70630
5 62093
6 52856
777 1
778 1
779 1
780 1
781 1
782 1
# Number of distinct PAGE values in the frequency table.
nrow(pg) # 782
#> [1] 782
# How many of them begin with the site's http/https prefix.
sum(grepl("^http(s)?://www.esunbank.com.tw/", pg$Var1)) # 782
#> [1] 782
# Distinct values that remain once that prefix is stripped.
n_distinct(gsub("^http(s)?://www.esunbank.com.tw/", "", pg$Var1)) # 671
#> [1] 671
Some URLs start with http:// instead of https://
# Tabulate how many rows do / do not start with the https form of the prefix.
# (The "." that was fused onto `table` is the header line of the printed table.)
str_detect(B$PAGE, "^https://www.esunbank.com.tw/") %>% table
#> .
FALSE TRUE
79165 2130699
Shorten the URL
# Record which rows used https before the prefix is removed from PAGE.
B$HTTPS= str_detect(B$PAGE, "^https://www.esunbank.com.tw/")
# Strip the scheme-and-host prefix so that PAGE holds only the path.
B$PAGE = str_replace(B$PAGE, "^http(s)?://www.esunbank.com.tw/", "")
# 💡: A sunburst diagram is one of the best visualisation tools for website
# traffic data.
# Build the path-frequency table that is later passed to sunburst().
# Existing "-" become "_" first, then every "/" becomes "-", so that "-" ends
# up marking the path levels only (presumably the separator sunburst() splits
# on -- confirm against the sunburstR documentation).
pages = B$PAGE %>%
  str_replace_all("-", "_") %>%
  str_replace_all("/", "-") %>%
  table %>% sort(decreasing = TRUE) %>% as.data.frame
head(pages)
#>    . Freq
1 edrn-pgusordq-fgposkt-udtg-iougz-iougkjr_gzchdrjg_udtgs 522355
2 edrn-pgusordq-fgposkt-udtg-iougz-gzchdrjg_udtg_chdut 363036
3 edrn-pgusordq 286790
4 edrn-pgusordq-cugfkt_cduf-ugwduf-ugsxqt 70630
5 edrn-pgusordq-cugfkt_cduf 62093
6 gygrt-e2c-iougkjr- 55720
# Draw the sunburst for the paths that occur more than 10 times.
sunburst(subset(pages, Freq > 10))
# Remove one trailing "/" so that "a/b/" and "a/b" are counted as the same page.
B$PAGE = str_replace(B$PAGE, "/$", "")
n_distinct(B$PAGE)
#> [1] 659
# Shrink the text and widen the left margin for the horizontal bar labels.
# par() returns the previous settings, which are kept in p0.
p0 = par(cex=0.8, mar=c(5,25,3,2))
# Bar chart of the 20 most frequent pages.
table(B$PAGE) %>% sort(decreasing = TRUE) %>% head(20) %>%
  barplot(horiz = TRUE, las = 2)
# Put the graphics parameters back so later plots are not affected.
par(p0)
# Page frequencies, most frequent first; names(PAGES) are the page paths.
PAGES = table(B$PAGE) %>% sort(decreasing = TRUE)
# Number of pages that occur more than i times, for i = 0..10.
# vapply() pins the result to one integer per threshold.
vapply(0:10, function(i) sum(PAGES > i), integer(1))
#>  [1] 659 611 573 557 541 522 508 501 490 482 476
# 611 573 557 541 522 508 501 490 482 476
# Recode PAGE as a factor whose levels follow names(PAGES), i.e. most frequent
# page first, so that as.integer(PAGE) is the page's frequency rank.
B$PAGE = factor(B$PAGE, levels=names(PAGES))
# One row per SS group:
#   npage - number of rows (page views) in the group
#   seq   - the group's page codes "P<pid>" joined by "-", in row order
S = B %>%
  mutate(pid = as.integer(PAGE)) %>%
  group_by(SS) %>%
  summarise(
    npage = n(),
    seq = paste0('P',pid,collapse="-")
  ) %>% ungroup() %>% data.frame
# Persist both tables.
save(B, S, file = "data/ys.rdata")
# The ten most common group sizes (npage values), in ascending order of count.
table(S$npage) %>% sort %>% tail(10)
10 9 8 7 6 5 4 3 2 1
5619 7222 9341 12953 17876 26728 39133 66804 124505 318442
# Share of SS groups that have more than i rows, for i = 1..10.
# vapply() pins the result to one double per threshold.
vapply(1:10, function(i) mean(S$npage > i), numeric(1)) %>% round(2)
#>  [1] 0.52 0.33 0.23 0.17 0.13 0.10 0.08 0.07 0.06 0.05
# The 20 most common page sequences, in ascending order of count.
table(S$seq) %>% sort %>% tail(20)
P3-P5 P2-P2-P2-P2 P1-P1-P1 P3-P3-P3 P11 P16 P2-P2-P2
4282 4415 4775 4956 5762 7001 7764
P12 P9 P1-P1 P17 P5 P10 P7
8911 9036 9168 10277 11370 11804 11972
P6 P2-P2 P3-P3 P1 P2 P3
12169 15907 17926 19918 39399 78261
# (npage == 1) The 30 most common sequences among groups with exactly one row.
table(subset(S, npage == 1)$seq) %>% sort %>% tail(30)
P29 P49 P40 P35 P41 P64 P20 P38 P33 P50 P25 P44 P58 P15 P28
1347 1363 1431 1465 1511 1536 1553 1601 1706 1726 1808 1808 2017 2019 2689
P34 P36 P30 P11 P16 P12 P9 P17 P5 P10 P7 P6 P1 P2 P3
2811 2993 4174 5762 7001 8911 9036 10277 11370 11804 11972 12169 19918 39399 78261
# (npage > 1) Count of each sequence among groups with more than one row,
# sorted in ascending order so that tail() picks the most common ones.
page2 = table(subset(S, npage > 1)$seq) %>% sort
# How many distinct sequences occur more than 50 times.
sum(page2 > 50)
#> [1] 562
# Fraction of all multi-row groups covered by the i most common sequences,
# for i = 100, 200, ..., 1000. vapply() pins the result to one double each.
vapply(
  seq(100,1000,100),
  function (i) sum(tail(page2, i))/sum(page2),
  numeric(1)
)
#>  [1] 0.4309 0.4940 0.5315 0.5561 0.5745 0.5895 0.6018 0.6124 0.6215 0.6294
# Sunbursts of the 100 and the 300 most common multi-row sequences.
data.frame(page2) %>% tail(100) %>% sunburst()
data.frame(page2) %>% tail(300) %>% sunburst()