pacman::p_load(dplyr, ggplot2, car, vcd, GGally, mvtnorm)
W = read.csv('../unit09/data/wholesales.csv')
W$Channel = factor( paste0("Ch",W$Channel) )
W$Region = factor( paste0("Reg",W$Region) )
W[3:8] = lapply(W[3:8], log, base=10)
summary(W)
Channel Region Fresh Milk Grocery
Ch1:298 Reg1: 77 Min. :0.477 Min. :1.74 Min. :0.477
Ch2:142 Reg2: 47 1st Qu.:3.495 1st Qu.:3.19 1st Qu.:3.333
Reg3:316 Median :3.930 Median :3.56 Median :3.677
Mean :3.792 Mean :3.53 Mean :3.666
3rd Qu.:4.229 3rd Qu.:3.86 3rd Qu.:4.028
Max. :5.050 Max. :4.87 Max. :4.968
Frozen Detergents_Paper Delicassen
Min. :1.40 Min. :0.477 Min. :0.477
1st Qu.:2.87 1st Qu.:2.409 1st Qu.:2.611
Median :3.18 Median :2.912 Median :2.985
Mean :3.17 Mean :2.947 Mean :2.895
3rd Qu.:3.55 3rd Qu.:3.594 3rd Qu.:3.260
Max. :4.78 Max. :4.611 Max. :4.681
B1. Correlation
cor(W$Milk, W$Grocery)
[1] 0.75885
B2. Correlation Test
cor.test(W$Milk, W$Grocery)
Pearson's product-moment correlation
data: W$Milk and W$Grocery
t = 24.4, df = 438, p-value <0.0000000000000002
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.71617 0.79588
sample estimates:
cor
0.75885
B3. Simple Scatter Plot
par(cex=0.7, mar=c(4,4,2,2))
plot(W$Milk, W$Grocery)
B3. Scatter Plot with Regrssion Line
ggplot(W, aes(x=Milk, y=Grocery)) +
geom_point(alpha=0.4) +
geom_smooth(method="lm")
B4. Simulating Bi-Variate Normal Distibution
par(cex=0.7, mar=c(1,1,1,1), mfrow=c(3,3))
for(r in seq(-1,1,0.25)) {
mu = c(0,0)
sigma = matrix(c(1,r,r,1),nrow=2) # covariance matrix
rmvnorm(500, mu, sigma) %>% plot(col='gray')
text(0,0,r,cex=2,col='blue',font=2)
}
C1. Matrix of Correlation Coefficients
cor(W[,3:8])
Fresh Milk Grocery Frozen Detergents_Paper
Fresh 1.000000 -0.019834 -0.13271 0.383996 -0.15587
Milk -0.019834 1.000000 0.75885 -0.055316 0.67794
Grocery -0.132713 0.758851 1.00000 -0.164524 0.79640
Frozen 0.383996 -0.055316 -0.16452 1.000000 -0.21158
Detergents_Paper -0.155871 0.677942 0.79640 -0.211576 1.00000
Delicassen 0.255186 0.337833 0.23573 0.254718 0.16674
Delicassen
Fresh 0.25519
Milk 0.33783
Grocery 0.23573
Frozen 0.25472
Detergents_Paper 0.16674
Delicassen 1.00000
C1. Matrix of Scatter Plots
car::scatterplotMatrix(W[,3:8])
D1. Chisq-Test for Association
table(W$Channel, W$Region) %>% chisq.test()
Pearson's Chi-squared test
data: .
X-squared = 4.35, df = 2, p-value = 0.11
Channel
and Region
is not significantChannel
does not depends on Region
and vice versa.D2. Mosaic Plot
library(vcd)
structable(Channel ~ Region, W) %>%
mosaic(shade=T, labeling=labeling_residuals)
D3. Another Example
haireye <- margin.table(HairEyeColor, 1:2)
haireye
Eye
Hair Brown Blue Hazel Green
Black 68 20 15 5
Brown 119 84 54 29
Red 26 17 14 14
Blond 7 94 10 16
mosaic(haireye, shade=T, labeling=labeling_residuals)
💡 學習重點:
■ The Association between Hair
and Eye
is significant
■ For Black-Hair:
■ the propotion of Brown-Eye is significantly higher than the expected
■ the propotion of Blue-Eye is significantly lower
■ For Red-Hair:
■ he propotion of Green-Eye is significantly higher
■ For Blond-Hair:
■ the propotion of Brown-Eye is significantly lower
■ the propotion of Blue-Eye is significantly higher
■ the propotion of Hazel-Eye is significantly lower
D4. The Expected Value
in Contingent Table
( expected = independence_table(haireye) )
Eye
Hair Brown Blue Hazel Green
Black 40.135 39.223 16.966 11.6757
Brown 106.284 103.868 44.929 30.9189
Red 26.385 25.785 11.154 7.6757
Blond 47.196 46.123 19.951 13.7297
It is derived from the marginal probability in both directions
(rowSums(haireye) %o% colSums(haireye)) / sum(haireye)
Brown Blue Hazel Green
Black 40.135 39.223 16.966 11.6757
Brown 106.284 103.868 44.929 30.9189
Red 26.385 25.785 11.154 7.6757
Blond 47.196 46.123 19.951 13.7297
D5. Residuals
( residuals = haireye - expected )
Eye
Hair Brown Blue Hazel Green
Black 27.86486 -19.22297 -1.96622 -6.67568
Brown 12.71622 -19.86824 9.07095 -1.91892
Red -0.38514 -8.78547 2.84628 6.32432
Blond -40.19595 47.87669 -9.95101 2.27027
D6. Standardized Residuals
( std.residuals = residuals / sqrt(expected) )
Eye
Hair Brown Blue Hazel Green
Black 4.398399 -3.069377 -0.477352 -1.953684
Brown 1.233458 -1.949477 1.353284 -0.345100
Red -0.074978 -1.730125 0.852253 2.282737
Blond -5.850997 7.049590 -2.227844 0.612698
🧙 熱圖與馬賽克圖的比較 Heatmap.Rmd
library(GGally)
ggpairs(iris, aes(colour = Species, alpha=0.4),
lower=list(combo = wrap("facethist", binwidth = 0.2)))
par(cex=0.8, mar=c(2,3,1,1), mfrow=c(1,1))
table(W$Channel) %>% barplot
par(cex=0.8, mar=c(4,3,1,1))
hist(W$Fresh)
par(cex=0.8, mar=c(2,3,1,1))
table(W$Channel, W$Region) %>% barplot()
tapply(W$Milk, W$Region, sum)
Reg1 Reg2 Reg3
270.08 163.29 1118.47
tapply(W$Milk, list(W$Channel, W$Region), sum)
Reg1 Reg2 Reg3
Ch1 198.750 90.366 706.21
Ch2 71.331 72.926 412.26
xtabs(Milk ~ Channel + Region, data=W)
Region
Channel Reg1 Reg2 Reg3
Ch1 198.750 90.366 706.214
Ch2 71.331 72.926 412.260
ggplot(W, aes(x=log(Milk))) +
geom_histogram(aes(fill=Region), alpha=0.5, bins=20) +
facet_grid(Channel~Region) +
labs(title="Dist. of Sales of Milk")
ggplot(W, aes(x=log(Milk), y=log(Fresh))) +
geom_point() +
stat_smooth(method="lm",se=F)
ggplot(W, aes(x=log(Milk), y=log(Fresh))) +
geom_point() +
stat_smooth(method="lm", se=F) +
facet_grid(~Region)
ggplot(W, aes(x=log(Milk), y=log(Fresh))) +
geom_point() +
stat_smooth(method="lm", se=F, col='red') +
facet_grid(Channel~Region)
💡 學習重點: 存在整個族群的關係和個別分群之中的關係有可能是不同(甚至是相反)的!
load("../unit09/data/olist.rdata")
P2 = group_by(R, order_id) %>%
summarise(score = mean(review_score)) %>%
right_join(O) %>% select(order_id, score) %>%
right_join(I) %>% group_by(product_id) %>%
summarise(score = mean(score)) %>%
right_join(P) %>% left_join(TPC) %>%
rename(category = product_category_name_english)
Joining, by = "order_id"
Joining, by = "order_id"
Joining, by = "product_id"
Joining, by = "product_category_name"
cor.test(P2$score, P2$product_photos_qty, use="complete.obs")
Pearson's product-moment correlation
data: P2$score and P2$product_photos_qty
t = 3.92, df = 32300, p-value = 0.00009
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.010881 0.032669
sample estimates:
cor
0.021778
K = 40
topK = table(P2$category) %>% sort %>% tail(K) %>% names
PK = filter(P2, category %in% topK)
par(cex=0.6, mar=c(4,20,4,2))
split(PK, PK$category) %>%
sapply(function(df) {
cor(df$score, df$product_photos_qty, use="complete.obs")
}) %>% sort %>%
barplot(horiz=T, las=2, main=sprintf(
"Cor(Rev.Score, Photo.Qty), Top%d", K))
abline(v=seq(-0.1, 0.1, 0.05), col="red", lty=2)