單元摘要: 期中專案(OLIST)資料探索
pacman::p_load(dplyr, MASS, dendextend, vegan, randomcoloR, googleVis, d3heatmap)
💡 請注意資料是放在上星期的資料夾裡面
load("../unit09/data/olist.rdata")
load("../unit09/data/Z.rdata")
product_category_name_english
into P
P = left_join(P, TPC) %>% rename(category = product_category_name_english)
Joining, by = "product_category_name"
category
into I
I = left_join(I, P[,c(1:10)])
Joining, by = "product_id"
mx
mx = xtabs(~ category + seller_id, I) > 0
dim(mx)
[1] 71 3033
dx= dist(mx, "binary")
hcx = hclust(dx, method="ward.D2")
par(mar=c(3,3,1,15), cex=0.8)
dend = as.dendrogram(hcx) # horizontal dendrogram
plot(dend,horiz=T)
k = 14; cols = distinctColorPalette(k)
kg=cutree(hcx, k)
dend %>% color_branches(k, col=cols) %>% color_labels(k) %>% plot(horiz=TRUE, lwd=2)
dend %>% rect.dendrogram(k, horiz=TRUE, border="#C0C0C018", col='#C0C0C018')
# abline(v=heights_per_k.dendrogram(dend)[as.character(k)]-0.03, lty=2, col="blue")
# There are many knids of MDS
# mdx = cmdscale(dx, eig=T) # classical MDS
# mdx = isoMDS(dx) # iso MDS
mdx = metaMDS(dx, k=2) # Nonmetric MDS
Run 0 stress 0.17852
Run 1 stress 0.17902
... Procrustes: rmse 0.093425 max resid 0.3105
Run 2 stress 0.17322
... New best solution
... Procrustes: rmse 0.094434 max resid 0.24645
Run 3 stress 0.17939
Run 4 stress 0.17059
... New best solution
... Procrustes: rmse 0.08428 max resid 0.27011
Run 5 stress 0.17626
Run 6 stress 0.18043
Run 7 stress 0.17441
Run 8 stress 0.17975
Run 9 stress 0.16916
... New best solution
... Procrustes: rmse 0.071537 max resid 0.32916
Run 10 stress 0.18075
Run 11 stress 0.17109
Run 12 stress 0.18108
Run 13 stress 0.18686
Run 14 stress 0.17462
Run 15 stress 0.17691
Run 16 stress 0.18479
Run 17 stress 0.18063
Run 18 stress 0.17942
Run 19 stress 0.18093
Run 20 stress 0.18434
*** No convergence -- monoMDS stopping criteria:
6: no. of iterations >= maxit
14: stress ratio > sratmax
x = mdx$points[,1]; y = mdx$points[,2]
par(mar=c(4,4,4,2), cex=0.65)
plot(x, y, xlab="Dim1", ylab="Dim2", main="MDS", type="n")
text(x, y, labels=row.names(mx), font=2, col=cols[kg])
💡 程式語言可以幫助我們整理資料之外,也可以幫助我們連接各種不同的分析方法,某一些方法只有在與其他方法一起使用的時候,才能發揮它最大的效用。
category
)做彙總category = filter(I, !is.na(category)) %>%
group_by(category) %>% summarise(
itemsSold = n(),
totalRev = sum(price),
avgPrice = mean(price),
noProduct = n_distinct(product_id),
noSeller = n_distinct(seller_id),
dummy = 2018
) %>% arrange(desc(totalRev))
top20 = category$category[1:20]
category[1:20,]
# A tibble: 20 x 7
category itemsSold totalRev avgPrice noProduct noSeller dummy
<chr> <int> <dbl> <dbl> <int> <int> <dbl>
1 health_beauty 9670 1258681. 130. 2444 492 2018
2 watches_gifts 5991 1205006. 201. 1329 101 2018
3 bed_bath_table 11115 1036989. 93.3 3029 196 2018
4 sports_leisure 8641 988049. 114. 2867 481 2018
5 computers_accessories 7827 911954. 117. 1639 287 2018
6 furniture_decor 8334 729762. 87.6 2657 370 2018
7 cool_stuff 3796 635291. 167. 789 267 2018
8 housewares 6964 632249. 90.8 2335 468 2018
9 auto 4235 592720. 140. 1900 383 2018
10 garden_tools 4347 485256. 112. 753 237 2018
11 toys 4117 483947. 118. 1411 252 2018
12 baby 3065 411765. 134. 919 244 2018
13 perfumery 3419 399125. 117. 868 175 2018
14 telephony 4545 323668. 71.2 1134 149 2018
15 office_furniture 1691 273961. 162. 309 34 2018
16 stationery 2517 230943. 91.8 849 173 2018
17 computers 203 222963. 1098. 30 9 2018
18 pet_shop 1947 214315. 110. 719 137 2018
19 musical_instruments 680 191499. 282. 289 70 2018
20 small_appliances 679 190649. 281. 231 105 2018
op = options(gvis.plot.tag='chart')
plot( gvisMotionChart(
category, "category", "dummy",
options=list(width=800, height=600) ))
category
)和季別(quarter
)做彙總X = left_join(O[, c(1,4)], R[,2:3]) %>% # pull score & timestamp into 'O'
rename(
time = order_purchase_timestamp,
score = review_score) %>%
mutate( # cut timestamp into quarter
quarter = as.Date(cut(time, "quarter"))
) %>%
right_join(I) %>% # merge score & quarter into 'I'
filter(category %in% top20) %>% # pick out the top20 categories
group_by(category, quarter) %>%
summarise( # summarise by category & quarter
itemsSold = n(),
totalRev = sum(price),
avgPrice = mean(price),
avgScore = mean(score),
noProduct = n_distinct(product_id),
noSeller = n_distinct(seller_id)
) %>%
arrange(category, quarter) # order by category & quarter
Joining, by = "order_id"
Joining, by = "order_id"
X2 = X %>%
filter(quarter >= as.Date("2017-04-01")) %>%
filter(!(category %in% c("computers", "office_furniture"))) %>%
mutate(Score = pmax(avgScore, 3)) %>% as.data.frame
plot( gvisMotionChart(
X2, "category", "quarter",
options=list(width=800, height=600) ))
💡 動態泡泡圖不但可以讓我們依各種軸度比較研究對象的之間的差異,它也可以幫助我們看到個體與整體的變化趨勢。
X = filter(I, !is.na(category)) %>%
group_by(category, seller_id) %>% # cascading groups
summarise(revenue = sum(price)) %>% # drop last grouping
arrange(category, desc(revenue)) %>% # arrange and ...
mutate( # mutate within cetegory
rn = row_number(desc(revenue)), # rank by revenue
share = revenue/sum(revenue), # market share
c.share = cumsum(share) # cumm. market share
)
category = X %>% group_by(category) %>%
summarise(
concentrate = sum(share^2),
top3.con = max(c.share[rn <= 3]),
top5.con = max(c.share[rn <= 5]),
top10.con = max(c.share[rn <= 10])
) %>%
right_join(category) %>%
arrange(desc(concentrate))
Joining, by = "category"
filter(category, totalRev > 100000)[,c(1:4,7,10)]
# A tibble: 26 x 6
category concentrate top3.con top5.con totalRev noSeller
<chr> <dbl> <dbl> <dbl> <dbl> <int>
1 computers 0.580 0.917 0.968 222963. 9
2 office_furniture 0.423 0.815 0.874 273961. 34
3 home_appliances_2 0.141 0.499 0.583 113318. 46
4 stationery 0.134 0.466 0.536 230943. 173
5 watches_gifts 0.0931 0.467 0.626 1205006. 101
6 luggage_accessories 0.0920 0.430 0.550 140430. 73
7 musical_instruments 0.0916 0.476 0.564 191499. 70
8 consoles_games 0.0832 0.395 0.506 157465. 82
9 perfumery 0.0828 0.446 0.552 399125. 175
10 garden_tools 0.0709 0.398 0.471 485256. 237
# … with 16 more rows
table(C$customer_state) %>%
sort(decreasing=T) %>%
prop.table %>%
cumsum
SP RJ MG RS PR SC BA DF ES GO
0.41981 0.54905 0.66605 0.72102 0.77175 0.80833 0.84232 0.86384 0.88428 0.90460
PE CE PA MT MA MS PB PI RN AL
0.92121 0.93464 0.94445 0.95357 0.96108 0.96827 0.97366 0.97864 0.98352 0.98767
SE TO RO AM AC AP RR
0.99119 0.99401 0.99655 0.99804 0.99885 0.99954 1.00000
C$customer_state
into I
via O
I = left_join(O[,1:2], C[,c(1,5)])[-2] %>% # merge state into `O`
right_join(I) %>% # then merge to `I`
rename(state=customer_state) # use a shoter name
Joining, by = "customer_id"
Joining, by = "order_id"
mx[c,s]
is the number of c
product items sold to s
mx = xtabs(~ category + state, I) # count the no. item sold
# If we want to use total revenue instead of counts, simply do
# mx = xtabs(price ~ category + state, I)
dim(mx) # 71 categories by 27 states
[1] 71 27
The data is heavily skewed
hist(mx, main=range(mx))
use 1+log transformation and reverse color scheme
-log(1+mx) %>% as.data.frame.matrix %>% d3heatmap()
💡 除了用顏色表現矩陣裡面的數字,熱圖可以幫我們在矩陣的兩個方向分別做層級式集群分析,幫助我們看到資料裡面的結構。