Make Session (SS) & Sequence (SQ) IDs

# Read the customer-behaviour CSV and convert the result to a base data.frame.
B <- data.frame(read_csv("data/TBN_CUST_BEHAVIOR.csv"))
Parsed with column specification:
cols(
  CUST_NO = col_character(),
  VISITDATE = col_integer(),
  PAGE = col_character()
)
# Record each row's position in the file before sorting, so rows stay traceable.
# seq_len() is used instead of 1:nrow(B) so an empty table yields no ids.
B$rid <- seq_len(nrow(B))
# Sort so that every (VISITDATE, CUST_NO) pair forms a contiguous run of rows.
B <- B %>% arrange(VISITDATE, CUST_NO)
# SS: session id, one integer per (VISITDATE, CUST_NO) combination.
# Group first and then ask for the indices: passing the grouping columns
# straight to group_indices() is deprecated in dplyr >= 1.0.0.
B$SS <- B %>%
  group_by(VISITDATE, CUST_NO) %>%
  group_indices()
# SQ: position of the row within its session (1, 2, ...), in sorted row order.
B <- B %>%
  group_by(SS) %>%
  mutate(SQ = row_number()) %>%
  ungroup() %>%
  data.frame()
# Spot-check a single session.
subset(B, SS == 7)
            CUST_NO VISITDATE
9  -2HNCB-ABGPPLFIW      9448
10 -2HNCB-ABGPPLFIW      9448
11 -2HNCB-ABGPPLFIW      9448
12 -2HNCB-ABGPPLFIW      9448
13 -2HNCB-ABGPPLFIW      9448
14 -2HNCB-ABGPPLFIW      9448
15 -2HNCB-ABGPPLFIW      9448
16 -2HNCB-ABGPPLFIW      9448
                                                                               PAGE
9                                         https://www.esunbank.com.tw/gygrt/e2c/ge/
10 https://www.esunbank.com.tw/edrn/pgusordq/fgposkt/udtg/iougz/gzchdrjg-udtg-chdut
11  https://www.esunbank.com.tw/edrn/pgusordq/fgposkt/udtg/iougz/cxuugrca-corygutgu
12  https://www.esunbank.com.tw/edrn/pgusordq/fgposkt/udtg/iougz/cxuugrca-corygutgu
13                 https://www.esunbank.com.tw/edrn/deoxt/drroxrcgmgrt/drroxrcgmgrt
14   https://www.esunbank.com.tw/edrn/pgusordq/fgposkt/udtg/twf/fgposkt-cdqcxqdtkor
15  https://www.esunbank.com.tw/edrn/pgusordq/fgposkt/iougkjr-sguykcg/fgopskt-gygrt
16                     https://www.esunbank.com.tw/gygrt/iougz/mdungtkrj/puomotkor/
      rid SS SQ
9  408098  7  1
10 408113  7  2
11 408121  7  3
12 408124  7  4
13 408126  7  5
14 408127  7  6
15 408130  7  7
16 408131  7  8

Examine the URLs

# Frequency table of the full URLs, most visited first (columns Var1, Freq).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
pg <- table(B$PAGE) %>%
  sort(decreasing = TRUE) %>%
  as.data.frame()
# Show the six most and the six least visited URLs together.
rbind(head(pg), tail(pg))
                                                                                   Var1
1   https://www.esunbank.com.tw/edrn/pgusordq/fgposkt/udtg/iougz/iougkjr-gzchdrjg-udtgs
2      https://www.esunbank.com.tw/edrn/pgusordq/fgposkt/udtg/iougz/gzchdrjg-udtg-chdut
3                                             https://www.esunbank.com.tw/edrn/pgusordq
4                   https://www.esunbank.com.tw/edrn/pgusordq/cugfkt-cduf/ugwduf/ugsxqt
5                                 https://www.esunbank.com.tw/edrn/pgusordq/cugfkt-cduf
6               https://www.esunbank.com.tw/edrn/pgusordq/cugfkt-cduf/fkscoxrt/shopkrio
777                           https://www.esunbank.com.tw/gygrt/wgdqth/gpdpgu/20180108/
778                           https://www.esunbank.com.tw/gygrt/wgdqth/gpdpgu/20180312/
779                           https://www.esunbank.com.tw/gygrt/wgdqth/gpdpgu/20180416/
780                           https://www.esunbank.com.tw/gygrt/wgdqth/mougqkcdqq_ixrf/
781                              https://www.esunbank.com.tw/gygrt/wgedtm/pqxs/mdkr.htm
782                           https://www.esunbank.com.tw/gygrt/wgedtm/tdoedo/kfgdqkig/
      Freq
1   522355
2   363036
3   286790
4    70630
5    62093
6    52856
777      1
778      1
779      1
780      1
781      1
782      1
dim(pg)[1L]  # number of distinct full URLs: 782
[1] 782
# Count the distinct URLs that sit on the bank's host (http or https).
# The dots are escaped so they match a literal "." rather than any character.
sum(grepl("^https?://www\\.esunbank\\.com\\.tw/", pg$Var1))  # 782
[1] 782
# Distinct paths left once scheme and host are stripped, so the http and https
# variants of the same page collapse into one.  sub() is enough because the
# pattern is anchored at the start; the dots are escaped to match literally.
n_distinct(sub("^https?://www\\.esunbank\\.com\\.tw/", "", pg$Var1))  # 671
[1] 671

Some URLs start with http:// rather than https://

# Tally page views whose URL starts with the https form of the bank's host
# (TRUE) against all others (FALSE).  Dots are escaped to match literally.
str_detect(B$PAGE, "^https://www\\.esunbank\\.com\\.tw/") %>% table()
.
  FALSE    TRUE 
  79165 2130699 

Shorten the URL

# Flag https page views first: the test needs the scheme and host, which the
# replacement on the next line removes from PAGE.
B$HTTPS <- str_detect(B$PAGE, "^https://www\\.esunbank\\.com\\.tw/")
# Keep only the path after the host.  Dots are escaped so they match a literal
# "." instead of any character.
B$PAGE <- str_replace(B$PAGE, "^https?://www\\.esunbank\\.com\\.tw/", "")

Sun-Burst : URL Hierarchy

💡: Sun-Burst Diagram 是網站流量資料最好的視覺化工具之一

# Build a (path, count) table for the sunburst chart, most visited path first.
# Hyphens inside a path become "_" first, so that "-" can then replace "/" as
# the level separator without clashing with hyphens in page names.
# NOTE(review): assumes sunburst() splits its first column on "-" - confirm
# against the sunburstR documentation.
pages <- B$PAGE %>%
  str_replace_all("-", "_") %>%
  str_replace_all("/", "-") %>%
  table() %>%
  sort(decreasing = TRUE) %>%
  as.data.frame()
head(pages)
                                                        .   Freq
1 edrn-pgusordq-fgposkt-udtg-iougz-iougkjr_gzchdrjg_udtgs 522355
2    edrn-pgusordq-fgposkt-udtg-iougz-gzchdrjg_udtg_chdut 363036
3                                           edrn-pgusordq 286790
4                 edrn-pgusordq-cugfkt_cduf-ugwduf-ugsxqt  70630
5                               edrn-pgusordq-cugfkt_cduf  62093
6                                      gygrt-e2c-iougkjr-  55720
# Draw the URL hierarchy, leaving out paths seen 10 times or fewer.
pages %>%
  subset(Freq > 10) %>%
  sunburst()

Legend
<

Working on Session & Sequence

# Remove one trailing slash so "a/b/" and "a/b" are counted as the same page.
B$PAGE <- B$PAGE %>% str_replace("/$", "")
# Number of distinct pages after that normalisation.
B$PAGE %>% n_distinct()
[1] 659
# Horizontal bar chart of the 20 most visited pages.
# The left margin is widened so the long page paths fit as bar labels; par()
# returns the previous settings, which are restored once the plot is drawn.
p0 <- par(cex = 0.8, mar = c(5, 25, 3, 2))
table(B$PAGE) %>%
  sort(decreasing = TRUE) %>%
  head(20) %>%
  barplot(horiz = TRUE, las = 2)
par(p0)

# Page-view count per page, most visited first.
PAGES <- table(B$PAGE) %>% sort(decreasing = TRUE)
# Number of pages viewed more than i times, for i = 0, 1, ..., 10.
# vapply() pins the result to one integer per i, unlike sapply().
vapply(0:10, function(i) sum(PAGES > i), integer(1))
 [1] 659 611 573 557 541 522 508 501 490 482 476
# Recode PAGE as a factor whose levels follow the order of PAGES, so the
# integer code of a page is its position in that frequency ranking.
B$PAGE <- factor(B$PAGE, levels = names(PAGES))
# One row per session: npage is the number of page views, seq is the visited
# page codes joined in row order as "P<code>-P<code>-...".
S <- B %>%
  group_by(SS) %>%
  summarise(
    npage = n(),
    seq = paste0("P", as.integer(PAGE), collapse = "-")
  ) %>%
  ungroup() %>%
  data.frame()
# Store the page-view table and the session table together.
save(B, S, file = "data/ys.rdata")
Number of pages per session
# The ten most common session lengths, in ascending order of frequency.
tail(sort(table(S$npage)), 10)

    10      9      8      7      6      5      4      3      2      1 
  5619   7222   9341  12953  17876  26728  39133  66804 124505 318442 
# Share of sessions with more than i page views, for i = 1, ..., 10.
# vapply() pins the result to one number per i, unlike sapply().
vapply(1:10, function(i) mean(S$npage > i), numeric(1)) %>% round(2)
 [1] 0.52 0.33 0.23 0.17 0.13 0.10 0.08 0.07 0.06 0.05
The most frequent sequence
# The 20 most frequent page sequences, in ascending order of frequency.
tail(sort(table(S$seq)), 20)

      P3-P5 P2-P2-P2-P2    P1-P1-P1    P3-P3-P3         P11         P16    P2-P2-P2 
       4282        4415        4775        4956        5762        7001        7764 
        P12          P9       P1-P1         P17          P5         P10          P7 
       8911        9036        9168       10277       11370       11804       11972 
         P6       P2-P2       P3-P3          P1          P2          P3 
      12169       15907       17926       19918       39399       78261 

Bounced Sequences (npage == 1)

# The 30 most frequent pages among single-page sessions, ascending.
tail(sort(table(S$seq[S$npage == 1])), 30)

  P29   P49   P40   P35   P41   P64   P20   P38   P33   P50   P25   P44   P58   P15   P28 
 1347  1363  1431  1465  1511  1536  1553  1601  1706  1726  1808  1808  2017  2019  2689 
  P34   P36   P30   P11   P16   P12    P9   P17    P5   P10    P7    P6    P1    P2    P3 
 2811  2993  4174  5762  7001  8911  9036 10277 11370 11804 11972 12169 19918 39399 78261 

Page Sequences (npage > 1)

# Frequency of each page sequence among sessions with more than one page view,
# sorted in ascending order.
page2 <- sort(table(S$seq[S$npage > 1]))
# Number of distinct sequences that occur more than 50 times.
sum(page2 > 50)
[1] 562
# Share of multi-page sessions covered by the 100, 200, ..., 1000 most frequent
# sequences.  page2 is sorted ascending, so tail() picks the most frequent ones.
# The grand total is computed once, inside local() so no extra variable is left
# behind, and vapply() pins the result to one number per cutoff.
local({
  total <- sum(page2)
  vapply(
    seq(100, 1000, 100),
    function(k) sum(tail(page2, k)) / total,
    numeric(1)
  )
})
 [1] 0.4309 0.4940 0.5315 0.5561 0.5745 0.5895 0.6018 0.6124 0.6215 0.6294
The 100 most freq sequences (43.1%)
# Sunburst of the 100 most frequent multi-page sequences (the last 100 rows,
# because page2 is sorted ascending).
sunburst(tail(data.frame(page2), 100))

Legend
<

The 300 most freq sequences (53.2%)
# Sunburst of the 300 most frequent multi-page sequences (the last 300 rows,
# because page2 is sorted ascending).
sunburst(tail(data.frame(page2), 300))

Legend
<