單元摘要: 期中專案(OLIST)資料探索

pacman::p_load(dplyr, MASS, dendextend, vegan, randomcoloR, googleVis, d3heatmap)

💡 請注意資料是放在上星期的資料夾裡面

load("../unit09/data/olist.rdata")
load("../unit09/data/Z.rdata")



【A】利用集群分析與尺度縮減的應用:產品(大項)分類

A1. Pull product_category_name_english into P
P = left_join(P, TPC) %>% rename(category = product_category_name_english)
Joining, by = "product_category_name"
A2. Merge category into I
I = left_join(I, P[,c(1:10)])
Joining, by = "product_id"
A3. Make a Category_Seller Binary Matrix: mx
mx = xtabs(~ category + seller_id, I) > 0
dim(mx)
[1]   71 3033
A4. Do Clusteing on Categories
dx= dist(mx, "binary")
hcx = hclust(dx, method="ward.D2")
par(mar=c(3,3,1,15), cex=0.8)
dend = as.dendrogram(hcx)  # horizontal dendrogram
plot(dend,horiz=T)

A5. Cut tree and make a nice dendrogram
k = 14; cols = distinctColorPalette(k)
kg=cutree(hcx, k)
dend %>% color_branches(k, col=cols) %>% color_labels(k) %>% plot(horiz=TRUE, lwd=2)
dend %>% rect.dendrogram(k, horiz=TRUE, border="#C0C0C018", col='#C0C0C018')

# abline(v=heights_per_k.dendrogram(dend)[as.character(k)]-0.03, lty=2, col="blue")
A6. Dimension Reduction by MDS - Multi-Dimension Scaling
# There are many knids of MDS
# mdx = cmdscale(dx, eig=T)  # classical MDS
# mdx = isoMDS(dx)           # iso MDS
mdx = metaMDS(dx, k=2)       # Nonmetric MDS
Run 0 stress 0.17852 
Run 1 stress 0.17902 
... Procrustes: rmse 0.093425  max resid 0.3105 
Run 2 stress 0.17322 
... New best solution
... Procrustes: rmse 0.094434  max resid 0.24645 
Run 3 stress 0.17939 
Run 4 stress 0.17059 
... New best solution
... Procrustes: rmse 0.08428  max resid 0.27011 
Run 5 stress 0.17626 
Run 6 stress 0.18043 
Run 7 stress 0.17441 
Run 8 stress 0.17975 
Run 9 stress 0.16916 
... New best solution
... Procrustes: rmse 0.071537  max resid 0.32916 
Run 10 stress 0.18075 
Run 11 stress 0.17109 
Run 12 stress 0.18108 
Run 13 stress 0.18686 
Run 14 stress 0.17462 
Run 15 stress 0.17691 
Run 16 stress 0.18479 
Run 17 stress 0.18063 
Run 18 stress 0.17942 
Run 19 stress 0.18093 
Run 20 stress 0.18434 
*** No convergence -- monoMDS stopping criteria:
     6: no. of iterations >= maxit
    14: stress ratio > sratmax
A7. Plot the Result as a Word Cloud
x = mdx$points[,1]; y = mdx$points[,2] 
par(mar=c(4,4,4,2), cex=0.65)
plot(x, y, xlab="Dim1", ylab="Dim2", main="MDS", type="n")
text(x, y, labels=row.names(mx), font=2, col=cols[kg])


💡 程式語言可以幫助我們整理資料之外,也可以幫助我們連接各種不同的分析方法,某一些方法只有在與其他方法一起使用的時候,才能發揮它最大的效用。




【B】靜態與動態泡泡圖,品類之間的比較

B1. 對品類(category)做彙總
category = filter(I, !is.na(category)) %>% 
  group_by(category) %>% summarise(
    itemsSold = n(),
    totalRev = sum(price),
    avgPrice = mean(price),
    noProduct = n_distinct(product_id),
    noSeller = n_distinct(seller_id),
    dummy = 2018
  ) %>% arrange(desc(totalRev))
B2. 總營收最大的20個品類
top20 = category$category[1:20]
category[1:20,]
# A tibble: 20 x 7
   category              itemsSold totalRev avgPrice noProduct noSeller dummy
   <chr>                     <int>    <dbl>    <dbl>     <int>    <int> <dbl>
 1 health_beauty              9670 1258681.    130.       2444      492  2018
 2 watches_gifts              5991 1205006.    201.       1329      101  2018
 3 bed_bath_table            11115 1036989.     93.3      3029      196  2018
 4 sports_leisure             8641  988049.    114.       2867      481  2018
 5 computers_accessories      7827  911954.    117.       1639      287  2018
 6 furniture_decor            8334  729762.     87.6      2657      370  2018
 7 cool_stuff                 3796  635291.    167.        789      267  2018
 8 housewares                 6964  632249.     90.8      2335      468  2018
 9 auto                       4235  592720.    140.       1900      383  2018
10 garden_tools               4347  485256.    112.        753      237  2018
11 toys                       4117  483947.    118.       1411      252  2018
12 baby                       3065  411765.    134.        919      244  2018
13 perfumery                  3419  399125.    117.        868      175  2018
14 telephony                  4545  323668.     71.2      1134      149  2018
15 office_furniture           1691  273961.    162.        309       34  2018
16 stationery                 2517  230943.     91.8       849      173  2018
17 computers                   203  222963.   1098.         30        9  2018
18 pet_shop                   1947  214315.    110.        719      137  2018
19 musical_instruments         680  191499.    282.        289       70  2018
20 small_appliances            679  190649.    281.        231      105  2018
B3. 靜態泡泡圖
op = options(gvis.plot.tag='chart')
plot( gvisMotionChart(
  category, "category", "dummy", 
  options=list(width=800, height=600) ))
B4. 對品類(category)和季別(quarter)做彙總
X = left_join(O[, c(1,4)], R[,2:3]) %>%     # pull score & timestamp into 'O'
  rename(
    time = order_purchase_timestamp, 
    score = review_score) %>% 
  mutate(                                   # cut timestamp into quarter    
    quarter = as.Date(cut(time, "quarter"))
    ) %>%  
  right_join(I) %>%                         # merge score & quarter into 'I'
  filter(category %in% top20) %>%           # pick out the top20 categories
  group_by(category, quarter) %>% 
  summarise(                            # summarise by category & quarter
    itemsSold = n(),                     
    totalRev = sum(price),
    avgPrice = mean(price),
    avgScore = mean(score),
    noProduct = n_distinct(product_id),
    noSeller = n_distinct(seller_id)
  ) %>% 
  arrange(category, quarter)            # order by category & quarter
Joining, by = "order_id"
Joining, by = "order_id"
B5. Some Adjustment before Ploting
X2 = X %>% 
  filter(quarter >= as.Date("2017-04-01")) %>% 
  filter(!(category %in% c("computers", "office_furniture"))) %>% 
  mutate(Score = pmax(avgScore, 3)) %>% as.data.frame
B6. 動態泡泡圖
plot( gvisMotionChart( 
  X2, "category", "quarter", 
  options=list(width=800, height=600) ))


💡 動態泡泡圖不但可以讓我們依各種軸度比較研究對象的之間的差異,它也可以幫助我們看到個體與整體的變化趨勢。




【C】產業經濟應用

C1. 準備資料
X = filter(I, !is.na(category)) %>% 
  group_by(category, seller_id) %>%     # cascading groups
  summarise(revenue = sum(price)) %>%   # drop last grouping
  arrange(category, desc(revenue)) %>%  # arrange and ...
  mutate(                               # mutate within cetegory
    rn = row_number(desc(revenue)),  # rank by revenue
    share = revenue/sum(revenue),    # market share 
    c.share = cumsum(share)          # cumm. market share
  )
C2. 計算產業集中度
category = X %>% group_by(category) %>% 
  summarise(
    concentrate = sum(share^2),
    top3.con = max(c.share[rn <= 3]),
    top5.con = max(c.share[rn <= 5]),
    top10.con = max(c.share[rn <= 10])
  ) %>% 
  right_join(category) %>% 
  arrange(desc(concentrate))
Joining, by = "category"
filter(category, totalRev > 100000)[,c(1:4,7,10)]
# A tibble: 26 x 6
   category            concentrate top3.con top5.con totalRev noSeller
   <chr>                     <dbl>    <dbl>    <dbl>    <dbl>    <int>
 1 computers                0.580     0.917    0.968  222963.        9
 2 office_furniture         0.423     0.815    0.874  273961.       34
 3 home_appliances_2        0.141     0.499    0.583  113318.       46
 4 stationery               0.134     0.466    0.536  230943.      173
 5 watches_gifts            0.0931    0.467    0.626 1205006.      101
 6 luggage_accessories      0.0920    0.430    0.550  140430.       73
 7 musical_instruments      0.0916    0.476    0.564  191499.       70
 8 consoles_games           0.0832    0.395    0.506  157465.       82
 9 perfumery                0.0828    0.446    0.552  399125.      175
10 garden_tools             0.0709    0.398    0.471  485256.      237
# … with 16 more rows



【D】熱圖與集群分析的綜合應用

D1. 每一州的顧客比率
table(C$customer_state) %>% 
  sort(decreasing=T) %>% 
  prop.table %>% 
  cumsum
     SP      RJ      MG      RS      PR      SC      BA      DF      ES      GO 
0.41981 0.54905 0.66605 0.72102 0.77175 0.80833 0.84232 0.86384 0.88428 0.90460 
     PE      CE      PA      MT      MA      MS      PB      PI      RN      AL 
0.92121 0.93464 0.94445 0.95357 0.96108 0.96827 0.97366 0.97864 0.98352 0.98767 
     SE      TO      RO      AM      AC      AP      RR 
0.99119 0.99401 0.99655 0.99804 0.99885 0.99954 1.00000 
D2. Merge C$customer_state into I via O
I = left_join(O[,1:2], C[,c(1,5)])[-2] %>%   # merge state into `O`
  right_join(I) %>%                          # then merge to `I`
  rename(state=customer_state)               # use a shoter name
Joining, by = "customer_id"
Joining, by = "order_id"
D3. Make a Category_State Matrix

mx[c,s] is the number of c product items sold to s

mx = xtabs(~ category + state, I)         # count the no. item sold
# If we want to use total revenue instead of counts, simply do
# mx = xtabs(price ~ category + state, I)  
dim(mx)  # 71 categories by 27 states
[1] 71 27

The data is heavily skewed

hist(mx, main=range(mx))

use 1+log transformation and reverse color scheme

-log(1+mx) %>% as.data.frame.matrix %>% d3heatmap()


💡 除了用顏色表現矩陣裡面的數字,熱圖可以幫我們在矩陣的兩個方向分別做層級式集群分析,幫助我們看到資料裡面的結構。