pacman::p_load(dplyr, ggplot2, car, vcd, GGally, mvtnorm)


【A】批發商資料集

W = read.csv('../unit09/data/wholesales.csv')
W$Channel = factor( paste0("Ch",W$Channel) )
W$Region = factor( paste0("Reg",W$Region) )
W[3:8] = lapply(W[3:8], log, base=10)
summary(W)
 Channel    Region        Fresh            Milk         Grocery     
 Ch1:298   Reg1: 77   Min.   :0.477   Min.   :1.74   Min.   :0.477  
 Ch2:142   Reg2: 47   1st Qu.:3.495   1st Qu.:3.19   1st Qu.:3.333  
           Reg3:316   Median :3.930   Median :3.56   Median :3.677  
                      Mean   :3.792   Mean   :3.53   Mean   :3.666  
                      3rd Qu.:4.229   3rd Qu.:3.86   3rd Qu.:4.028  
                      Max.   :5.050   Max.   :4.87   Max.   :4.968  
     Frozen     Detergents_Paper   Delicassen   
 Min.   :1.40   Min.   :0.477    Min.   :0.477  
 1st Qu.:2.87   1st Qu.:2.409    1st Qu.:2.611  
 Median :3.18   Median :2.912    Median :2.985  
 Mean   :3.17   Mean   :2.947    Mean   :2.895  
 3rd Qu.:3.55   3rd Qu.:3.594    3rd Qu.:3.260  
 Max.   :4.78   Max.   :4.611    Max.   :4.681  


【B】連續變數的相關性(係數) Correlation

B1. Correlation

cor(W$Milk, W$Grocery)
[1] 0.75885

B2. Correlation Test

cor.test(W$Milk, W$Grocery)

    Pearson's product-moment correlation

data:  W$Milk and W$Grocery
t = 24.4, df = 438, p-value <0.0000000000000002
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.71617 0.79588
sample estimates:
    cor 
0.75885 

B3. Simple Scatter Plot

par(cex=0.7, mar=c(4,4,2,2))
plot(W$Milk, W$Grocery)

B3. Scatter Plot with Regrssion Line

ggplot(W, aes(x=Milk, y=Grocery)) +
  geom_point(alpha=0.4) +
  geom_smooth(method="lm")

B4. Simulating Bi-Variate Normal Distibution

par(cex=0.7, mar=c(1,1,1,1), mfrow=c(3,3))
for(r in seq(-1,1,0.25)) {
  mu = c(0,0)
  sigma = matrix(c(1,r,r,1),nrow=2)   # covariance matrix 
  rmvnorm(500, mu, sigma) %>% plot(col='gray')
  text(0,0,r,cex=2,col='blue',font=2)
  }


【C】相關性矩陣

C1. Matrix of Correlation Coefficients

cor(W[,3:8])
                     Fresh      Milk  Grocery    Frozen Detergents_Paper
Fresh             1.000000 -0.019834 -0.13271  0.383996         -0.15587
Milk             -0.019834  1.000000  0.75885 -0.055316          0.67794
Grocery          -0.132713  0.758851  1.00000 -0.164524          0.79640
Frozen            0.383996 -0.055316 -0.16452  1.000000         -0.21158
Detergents_Paper -0.155871  0.677942  0.79640 -0.211576          1.00000
Delicassen        0.255186  0.337833  0.23573  0.254718          0.16674
                 Delicassen
Fresh               0.25519
Milk                0.33783
Grocery             0.23573
Frozen              0.25472
Detergents_Paper    0.16674
Delicassen          1.00000

C1. Matrix of Scatter Plots

car::scatterplotMatrix(W[,3:8])


【D】類別變數之間的關聯性 Association

D1. Chisq-Test for Association

table(W$Channel, W$Region) %>% chisq.test()

    Pearson's Chi-squared test

data:  .
X-squared = 4.35, df = 2, p-value = 0.11

D2. Mosaic Plot

library(vcd)
structable(Channel ~ Region, W) %>% 
  mosaic(shade=T, labeling=labeling_residuals)

D3. Another Example

haireye <- margin.table(HairEyeColor, 1:2)
haireye
       Eye
Hair    Brown Blue Hazel Green
  Black    68   20    15     5
  Brown   119   84    54    29
  Red      26   17    14    14
  Blond     7   94    10    16
mosaic(haireye, shade=T, labeling=labeling_residuals)

💡 學習重點:
  ■ The Association between Hair and Eye is significant
  ■ For Black-Hair:
    ■ the propotion of Brown-Eye is significantly higher than the expected
    ■ the propotion of Blue-Eye is significantly lower
  ■ For Red-Hair:
    ■ he propotion of Green-Eye is significantly higher
  ■ For Blond-Hair:
    ■ the propotion of Brown-Eye is significantly lower
    ■ the propotion of Blue-Eye is significantly higher
    ■ the propotion of Hazel-Eye is significantly lower


D4. The Expected Value in Contingent Table

( expected = independence_table(haireye) )
       Eye
Hair      Brown    Blue  Hazel   Green
  Black  40.135  39.223 16.966 11.6757
  Brown 106.284 103.868 44.929 30.9189
  Red    26.385  25.785 11.154  7.6757
  Blond  47.196  46.123 19.951 13.7297

It is derived from the marginal probability in both directions

(rowSums(haireye) %o% colSums(haireye)) / sum(haireye)
        Brown    Blue  Hazel   Green
Black  40.135  39.223 16.966 11.6757
Brown 106.284 103.868 44.929 30.9189
Red    26.385  25.785 11.154  7.6757
Blond  47.196  46.123 19.951 13.7297

D5. Residuals

( residuals = haireye - expected ) 
       Eye
Hair        Brown      Blue     Hazel     Green
  Black  27.86486 -19.22297  -1.96622  -6.67568
  Brown  12.71622 -19.86824   9.07095  -1.91892
  Red    -0.38514  -8.78547   2.84628   6.32432
  Blond -40.19595  47.87669  -9.95101   2.27027

D6. Standardized Residuals

( std.residuals = residuals / sqrt(expected)  ) 
       Eye
Hair        Brown      Blue     Hazel     Green
  Black  4.398399 -3.069377 -0.477352 -1.953684
  Brown  1.233458 -1.949477  1.353284 -0.345100
  Red   -0.074978 -1.730125  0.852253  2.282737
  Blond -5.850997  7.049590 -2.227844  0.612698


🧙 熱圖與馬賽克圖的比較 Heatmap.Rmd



【E】類別與連續變數之間的關係 - GGally

library(GGally)
ggpairs(iris, aes(colour = Species, alpha=0.4),
        lower=list(combo = wrap("facethist", binwidth = 0.2)))

【F】類別與連續變數的各種可能關係圖示

F1. 單一類別變數
par(cex=0.8, mar=c(2,3,1,1), mfrow=c(1,1))
table(W$Channel) %>% barplot

F2. 單一連續變數
par(cex=0.8, mar=c(4,3,1,1))
hist(W$Fresh)

F3. 兩類別變數
par(cex=0.8, mar=c(2,3,1,1))
table(W$Channel, W$Region) %>% barplot()

F4. 類別 X 數量
tapply(W$Milk, W$Region, sum)
   Reg1    Reg2    Reg3 
 270.08  163.29 1118.47 
tapply(W$Milk, list(W$Channel, W$Region), sum)
       Reg1   Reg2   Reg3
Ch1 198.750 90.366 706.21
Ch2  71.331 72.926 412.26
xtabs(Milk ~ Channel + Region, data=W)
       Region
Channel    Reg1    Reg2    Reg3
    Ch1 198.750  90.366 706.214
    Ch2  71.331  72.926 412.260
ggplot(W, aes(x=log(Milk))) +
  geom_histogram(aes(fill=Region), alpha=0.5, bins=20) +
  facet_grid(Channel~Region) +
  labs(title="Dist. of Sales of Milk")

F5. 數量 X 數量 (X 類別)
ggplot(W, aes(x=log(Milk), y=log(Fresh))) +
  geom_point() +
  stat_smooth(method="lm",se=F)

ggplot(W, aes(x=log(Milk), y=log(Fresh))) +
  geom_point() +
  stat_smooth(method="lm", se=F) +
  facet_grid(~Region)

ggplot(W, aes(x=log(Milk), y=log(Fresh))) +
  geom_point() +
  stat_smooth(method="lm", se=F, col='red') +
  facet_grid(Channel~Region)


💡 學習重點: 存在整個族群的關係和個別分群之中的關係有可能是不同(甚至是相反)的!

load("../unit09/data/olist.rdata")

P2 = group_by(R, order_id) %>% 
  summarise(score = mean(review_score)) %>% 
  right_join(O) %>% select(order_id, score) %>% 
  right_join(I) %>% group_by(product_id) %>% 
  summarise(score = mean(score)) %>% 
  right_join(P) %>% left_join(TPC) %>% 
  rename(category = product_category_name_english)
Joining, by = "order_id"
Joining, by = "order_id"
Joining, by = "product_id"
Joining, by = "product_category_name"
cor.test(P2$score, P2$product_photos_qty, use="complete.obs")

    Pearson's product-moment correlation

data:  P2$score and P2$product_photos_qty
t = 3.92, df = 32300, p-value = 0.00009
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.010881 0.032669
sample estimates:
     cor 
0.021778 
K = 40
topK = table(P2$category) %>% sort %>% tail(K) %>% names
PK = filter(P2, category %in% topK)
par(cex=0.6, mar=c(4,20,4,2))
split(PK, PK$category) %>% 
  sapply(function(df) {
    cor(df$score, df$product_photos_qty, use="complete.obs")
  }) %>% sort %>% 
  barplot(horiz=T, las=2, main=sprintf(
    "Cor(Rev.Score, Photo.Qty), Top%d", K))
abline(v=seq(-0.1, 0.1, 0.05), col="red", lty=2)