Four Kinds of Sequence
# takes about 3 minutes
# Collapse B to one row per SS value (presumably a session id -- confirm)
# and label each group by the shape of its page sequence.
t0 <- Sys.time()
S <- B %>%
  mutate(pid = as.integer(PAGE)) %>%
  group_by(SS) %>%
  summarise(
    # number of rows (page views) in the group
    npage = n(),
    # page ids joined in row order, e.g. "P3-P3-P5"
    seq = paste0("P", pid, collapse = "-"),
    # branches are tried top-down, so "Repeat" only sees groups with 2+ rows
    case = case_when(
      npage == 1 ~ "Bounce",
      all(pid == first(pid)) ~ "Repeat",
      npage < 5 ~ "Short",
      TRUE ~ "Long"
    )
  ) %>%
  ungroup() %>%
  data.frame()
S$case <- factor(S$case)
# Persist both the raw table and the summary for later reuse.
save(B, S, file = "data/ys.rdata")
# Elapsed time of the steps above.
Sys.time() - t0
Time difference of 2.851 mins
# Share of rows of S in each category, largest first, rounded to 3 decimals.
# `decreasing = TRUE` is spelled out: `dec=T` depends on partial argument
# matching and on `T`, which is an ordinary variable that can be reassigned.
table(S$case) %>% prop.table() %>% sort(decreasing = TRUE) %>% round(3)
Bounce Repeat Short Long
0.482 0.209 0.178 0.131
# For each level of S$case, tabulate the sequences in that category from most
# to least frequent. Each data frame gets two extra columns:
#   Percent - the sequence's share of the category, in percent
#   CumPcg  - the running (cumulative) share, in percent
# The result is a list of data frames named by the levels of S$case.
L <- lapply(levels(S$case), function(x) {
  # `decreasing = TRUE` is spelled out: `dec=T` depends on partial argument
  # matching and on `T`, which is an ordinary variable that can be reassigned.
  df <- table(S$seq[S$case == x]) %>% sort(decreasing = TRUE) %>% data.frame()
  # The category total is the same for both columns; compute it once.
  total <- sum(df$Freq)
  df$Percent <- 100 * df$Freq / total
  df$CumPcg <- 100 * cumsum(df$Freq) / total
  df
})
names(L) <- levels(S$case)
Bounced Sequences (48.2%)
[1] 526
Var1 Freq Percent CumPcg
1 P3 78261 24.5762 24.58
2 P2 39399 12.3724 36.95
3 P1 19918 6.2548 43.20
4 P6 12169 3.8214 47.02
5 P7 11972 3.7596 50.78
6 P10 11804 3.7068 54.49
7 P5 11370 3.5705 58.06
8 P17 10277 3.2273 61.29
9 P9 9036 2.8376 64.13
10 P12 8911 2.7983 66.92
11 P16 7001 2.1985 69.12
12 P11 5762 1.8094 70.93
13 P30 4174 1.3108 72.24
14 P36 2993 0.9399 73.18
15 P34 2811 0.8827 74.07
16 P28 2689 0.8444 74.91
17 P15 2019 0.6340 75.54
18 P58 2017 0.6334 76.18
19 P25 1808 0.5678 76.75
20 P44 1808 0.5678 77.31
Repeating Sequences (20.9%)
[1] 1843
Var1 Freq Percent CumPcg
1 P3-P3 17926 13.0095 13.01
2 P2-P2 15907 11.5442 24.55
3 P1-P1 9168 6.6535 31.21
4 P2-P2-P2 7764 5.6346 36.84
5 P3-P3-P3 4956 3.5967 40.44
6 P1-P1-P1 4775 3.4654 43.90
7 P2-P2-P2-P2 4415 3.2041 47.11
8 P7-P7 3340 2.4239 49.53
9 P1-P1-P1-P1 2942 2.1351 51.67
10 P2-P2-P2-P2-P2 2622 1.9029 53.57
11 P6-P6 2617 1.8992 55.47
12 P5-P5 2225 1.6148 57.08
13 P1-P1-P1-P1-P1 1861 1.3506 58.43
14 P3-P3-P3-P3 1763 1.2795 59.71
15 P2-P2-P2-P2-P2-P2 1748 1.2686 60.98
16 P10-P10 1688 1.2250 62.21
17 P17-P17 1499 1.0879 63.30
18 P1-P1-P1-P1-P1-P1 1294 0.9391 64.23
19 P2-P2-P2-P2-P2-P2-P2 1171 0.8498 65.08
20 P7-P7-P7 1134 0.8230 65.91
Short Sequences (17.8%)
[1] 23343
Var1 Freq Percent CumPcg
1 P3-P5 4282 3.6370 3.637
2 P1-P2 2070 1.7582 5.395
3 P9-P8-P8 2031 1.7250 7.120
4 P1-P1-P2 1675 1.4227 8.543
5 P3-P1 1537 1.3055 9.848
6 P6-P2 1534 1.3029 11.151
7 P3-P3-P5 1410 1.1976 12.349
8 P12-P7 1354 1.1500 13.499
9 P2-P11 1055 0.8961 14.395
10 P19-P19-P28 938 0.7967 15.192
11 P1-P1-P1-P2 891 0.7568 15.948
12 P3-P1-P13 864 0.7338 16.682
13 P9-P15 802 0.6812 17.363
14 P3-P5-P5 799 0.6786 18.042
15 P3-P32 748 0.6353 18.677
16 P12-P33 742 0.6302 19.308
17 P3-P20 728 0.6183 19.926
18 P35-P40 721 0.6124 20.538
19 P3-P3-P32 634 0.5385 21.077
20 P3-P3-P20 628 0.5334 21.610
# Cumulative share (%) covered by the top 100, 200, ..., 500 short sequences.
L[["Short"]][["CumPcg"]][seq(from = 100, to = 500, by = 100)]
[1] 39.96 48.22 52.88 56.13 58.64
Sunburst of the 300 most frequent short sequences (52.9%)
# Plot the first 300 rows of the Short table, using its first two columns
# (sequence and frequency).
sunburst(L[["Short"]][seq_len(300), 1:2])
Long Sequences (13.1%)
[1] 56161
Var1 Freq Percent CumPcg
1 P1-P1-P1-P2-P2 595 0.6858 0.6858
2 P1-P1-P1-P1-P2 574 0.6616 1.3473
3 P9-P21-P21-P8-P8 548 0.6316 1.9789
4 P9-P8-P8-P21-P21 515 0.5936 2.5725
5 P1-P1-P1-P1-P1-P2 388 0.4472 3.0197
6 P1-P1-P1-P1-P2-P2 330 0.3803 3.4000
7 P3-P3-P1-P13-P13 324 0.3734 3.7734
8 P1-P1-P1-P1-P1-P1-P2 294 0.3388 4.1123
9 P9-P8-P8-P8-P8 262 0.3020 4.4142
10 P1-P1-P1-P1-P1-P2-P2 256 0.2950 4.7093
11 P1-P1-P1-P1-P2-P2-P2 236 0.2720 4.9813
12 P1-P1-P1-P1-P1-P1-P2-P2 217 0.2501 5.2314
13 P2-P2-P2-P2-P11 215 0.2478 5.4792
14 P1-P1-P1-P1-P1-P1-P1-P2 208 0.2397 5.7189
15 P1-P1-P1-P2-P2-P2 203 0.2340 5.9529
16 P1-P1-P2-P2-P2 188 0.2167 6.1695
17 P3-P3-P3-P5-P5 171 0.1971 6.3666
18 P1-P1-P1-P1-P1-P1-P1-P1-P2 169 0.1948 6.5614
19 P1-P1-P1-P1-P1-P2-P2-P2 166 0.1913 6.7527
20 P6-P2-P2-P2-P2 157 0.1809 6.9337
# Cumulative share (%) covered by the top 100, 200, ..., 1000 long sequences.
L[["Long"]][["CumPcg"]][seq(from = 100, to = 1000, by = 100)]
[1] 13.86 17.60 20.03 21.78 23.14 24.31 25.32 26.21 27.01 27.70
Sunburst of the 700 most frequent long sequences (25.3%)
# Plot the first 700 rows of the Long table, using its first two columns
# (sequence and frequency). The original indexed L$Short here, a copy-paste
# slip: this section is about long sequences, so the chart must read L$Long.
sunburst(L$Long[1:700, 1:2])