# Restore the objects saved in data/ys.rdata into the current environment.
# NOTE(review): the code below reads `B` without defining it, so `B` is
# presumably one of the objects stored in this file — confirm.
load("data/ys.rdata")

Four Kinds of Sequence

# Build S: one row per SS group (presumably a session id — confirm) holding
# the number of rows in the group, the page path written as "P<id>-P<id>-..."
# in the row order of B, and one of four sequence types.
# Slow on the full data: the recorded run took about 3 minutes.
t0 <- Sys.time()
S <- B %>%
  mutate(pid = as.integer(PAGE)) %>%
  group_by(SS) %>%
  summarise(
    npage = n(),
    seq = paste0("P", pid, collapse = "-"),
    # Rules are tried top to bottom; the first one that matches wins.
    case = case_when(
      npage == 1 ~ "Bounce",              # a single page view
      all(pid == first(pid)) ~ "Repeat",  # every view is the same page
      npage < 5 ~ "Short",                # fewer than 5 views
      TRUE ~ "Long"                       # everything else
    )
  ) %>%
  ungroup() %>%
  data.frame()
S$case <- factor(S$case)
# Write B and the new S back to data/ys.rdata (the same path loaded earlier).
save(B, S, file = "data/ys.rdata")
# Elapsed wall-clock time for the whole step.
Sys.time() - t0
Time difference of 2.851 mins
# Share of rows of S falling in each sequence type, largest first, rounded to
# three decimals. `decreasing = TRUE` is spelled out: `dec=T` relied on partial
# argument matching and on `T`, which can be reassigned.
table(S$case) %>%
  prop.table() %>%
  sort(decreasing = TRUE) %>%
  round(3)

Bounce Repeat  Short   Long 
 0.482  0.209  0.178  0.131 
# L: a named list with one frequency table per sequence type (level of S$case).
# Each table has one row per distinct sequence string, most frequent first:
#   Var1    - the sequence string
#   Freq    - number of rows of S with that sequence
#   Percent - Freq as a percentage of all rows of that type
#   CumPcg  - running total of Percent down the table
L <- lapply(levels(S$case), function(x) {
  # `decreasing = TRUE` is spelled out (no partial matching, no reassignable T).
  freq <- table(S$seq[S$case == x]) %>%
    sort(decreasing = TRUE) %>%
    data.frame()
  total <- sum(freq$Freq)
  freq$Percent <- 100 * freq$Freq / total
  freq$CumPcg <- 100 * cumsum(freq$Freq) / total
  freq
})
names(L) <- levels(S$case)

Bounced Sequences (48.2%)

# Number of distinct sequences of type Bounce (L$Bounce has one row per sequence).
nrow(L$Bounce)
[1] 526
# The 20 most frequent Bounce sequences, with count, percent and cumulative percent.
head(L$Bounce, 20)
   Var1  Freq Percent CumPcg
1    P3 78261 24.5762  24.58
2    P2 39399 12.3724  36.95
3    P1 19918  6.2548  43.20
4    P6 12169  3.8214  47.02
5    P7 11972  3.7596  50.78
6   P10 11804  3.7068  54.49
7    P5 11370  3.5705  58.06
8   P17 10277  3.2273  61.29
9    P9  9036  2.8376  64.13
10  P12  8911  2.7983  66.92
11  P16  7001  2.1985  69.12
12  P11  5762  1.8094  70.93
13  P30  4174  1.3108  72.24
14  P36  2993  0.9399  73.18
15  P34  2811  0.8827  74.07
16  P28  2689  0.8444  74.91
17  P15  2019  0.6340  75.54
18  P58  2017  0.6334  76.18
19  P25  1808  0.5678  76.75
20  P44  1808  0.5678  77.31

Repeating Sequences (20.9%)

# Number of distinct sequences of type Repeat (L$Repeat has one row per sequence).
nrow(L$Repeat)
[1] 1843
# The 20 most frequent Repeat sequences, with count, percent and cumulative percent.
head(L$Repeat, 20)
                   Var1  Freq Percent CumPcg
1                 P3-P3 17926 13.0095  13.01
2                 P2-P2 15907 11.5442  24.55
3                 P1-P1  9168  6.6535  31.21
4              P2-P2-P2  7764  5.6346  36.84
5              P3-P3-P3  4956  3.5967  40.44
6              P1-P1-P1  4775  3.4654  43.90
7           P2-P2-P2-P2  4415  3.2041  47.11
8                 P7-P7  3340  2.4239  49.53
9           P1-P1-P1-P1  2942  2.1351  51.67
10       P2-P2-P2-P2-P2  2622  1.9029  53.57
11                P6-P6  2617  1.8992  55.47
12                P5-P5  2225  1.6148  57.08
13       P1-P1-P1-P1-P1  1861  1.3506  58.43
14          P3-P3-P3-P3  1763  1.2795  59.71
15    P2-P2-P2-P2-P2-P2  1748  1.2686  60.98
16              P10-P10  1688  1.2250  62.21
17              P17-P17  1499  1.0879  63.30
18    P1-P1-P1-P1-P1-P1  1294  0.9391  64.23
19 P2-P2-P2-P2-P2-P2-P2  1171  0.8498  65.08
20             P7-P7-P7  1134  0.8230  65.91

Short Sequences (17.8%)

# Number of distinct sequences of type Short (L$Short has one row per sequence).
nrow(L$Short)
[1] 23343
# The 20 most frequent Short sequences, with count, percent and cumulative percent.
head(L$Short, 20)
          Var1 Freq Percent CumPcg
1        P3-P5 4282  3.6370  3.637
2        P1-P2 2070  1.7582  5.395
3     P9-P8-P8 2031  1.7250  7.120
4     P1-P1-P2 1675  1.4227  8.543
5        P3-P1 1537  1.3055  9.848
6        P6-P2 1534  1.3029 11.151
7     P3-P3-P5 1410  1.1976 12.349
8       P12-P7 1354  1.1500 13.499
9       P2-P11 1055  0.8961 14.395
10 P19-P19-P28  938  0.7967 15.192
11 P1-P1-P1-P2  891  0.7568 15.948
12   P3-P1-P13  864  0.7338 16.682
13      P9-P15  802  0.6812 17.363
14    P3-P5-P5  799  0.6786 18.042
15      P3-P32  748  0.6353 18.677
16     P12-P33  742  0.6302 19.308
17      P3-P20  728  0.6183 19.926
18     P35-P40  721  0.6124 20.538
19   P3-P3-P32  634  0.5385 21.077
20   P3-P3-P20  628  0.5334 21.610
# Cumulative percentage covered by the top 100, 200, ..., 500 Short sequences.
L$Short$CumPcg[seq(100, 500, by = 100)]
[1] 39.96 48.22 52.88 56.13 58.64
Sunburst of the 300 most frequent short sequences (52.9%)
# Plot the 300 most frequent Short sequences, passing the sequence string and
# its count (the first two columns of the table).
# NOTE(review): sunburst() is not defined in this view — presumably
# sunburstR::sunburst; confirm.
sunburst(L$Short[seq_len(300), c("Var1", "Freq")])

Legend

Long Sequences (13.1%)

# Number of distinct sequences of type Long (L$Long has one row per sequence).
nrow(L$Long)
[1] 56161
# The 20 most frequent Long sequences, with count, percent and cumulative percent.
head(L$Long, 20)
                         Var1 Freq Percent CumPcg
1              P1-P1-P1-P2-P2  595  0.6858 0.6858
2              P1-P1-P1-P1-P2  574  0.6616 1.3473
3            P9-P21-P21-P8-P8  548  0.6316 1.9789
4            P9-P8-P8-P21-P21  515  0.5936 2.5725
5           P1-P1-P1-P1-P1-P2  388  0.4472 3.0197
6           P1-P1-P1-P1-P2-P2  330  0.3803 3.4000
7            P3-P3-P1-P13-P13  324  0.3734 3.7734
8        P1-P1-P1-P1-P1-P1-P2  294  0.3388 4.1123
9              P9-P8-P8-P8-P8  262  0.3020 4.4142
10       P1-P1-P1-P1-P1-P2-P2  256  0.2950 4.7093
11       P1-P1-P1-P1-P2-P2-P2  236  0.2720 4.9813
12    P1-P1-P1-P1-P1-P1-P2-P2  217  0.2501 5.2314
13            P2-P2-P2-P2-P11  215  0.2478 5.4792
14    P1-P1-P1-P1-P1-P1-P1-P2  208  0.2397 5.7189
15          P1-P1-P1-P2-P2-P2  203  0.2340 5.9529
16             P1-P1-P2-P2-P2  188  0.2167 6.1695
17             P3-P3-P3-P5-P5  171  0.1971 6.3666
18 P1-P1-P1-P1-P1-P1-P1-P1-P2  169  0.1948 6.5614
19    P1-P1-P1-P1-P1-P2-P2-P2  166  0.1913 6.7527
20             P6-P2-P2-P2-P2  157  0.1809 6.9337
# Cumulative percentage covered by the top 100, 200, ..., 1000 Long sequences.
L$Long$CumPcg[seq(100, 1000, by = 100)]
 [1] 13.86 17.60 20.03 21.78 23.14 24.31 25.32 26.21 27.01 27.70
Sunburst of the 700 most frequent long sequences (25.3%)
# Plot the 700 most frequent Long sequences (sequence string and count columns).
# This must read L$Long: the call previously plotted L$Short, which contradicts
# this section and the 25.3% coverage figure, which is L$Long$CumPcg[700].
# NOTE(review): sunburst() is not defined in this view — presumably
# sunburstR::sunburst; confirm.
sunburst(L$Long[1:700, 1:2])
Legend