Loading & Preparing Data

pacman::p_load(dplyr,ggplot2,caTools)
rm(list=ls(all=TRUE))
Sys.setlocale("LC_TIME","C")
## [1] "C"
load("../unit15/data/tf2.rdata")
Spliting for Classification
TR = subset(A, spl)
TS = subset(A, !spl)


Classification Model

glm1 = glm(buy ~ ., TR[,c(2:9, 11)], family=binomial()) 
summary(glm1)
## 
## Call:
## glm(formula = buy ~ ., family = binomial(), data = TR[, c(2:9, 
##     11)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.7931  -0.8733  -0.6991   1.0384   1.8735  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.259e+00  1.261e-01  -9.985  < 2e-16 ***
## r            -1.227e-02  8.951e-04 -13.708  < 2e-16 ***
## s             9.566e-03  9.101e-04  10.511  < 2e-16 ***
## f             2.905e-01  1.593e-02  18.233  < 2e-16 ***
## m            -3.028e-05  2.777e-05  -1.090  0.27559    
## rev           4.086e-05  1.940e-05   2.106  0.03521 *  
## raw          -2.306e-04  8.561e-05  -2.693  0.00708 ** 
## agea25       -4.194e-02  8.666e-02  -0.484  0.62838    
## agea30        1.772e-02  7.992e-02   0.222  0.82456    
## agea35        7.705e-02  7.921e-02   0.973  0.33074    
## agea40        8.699e-02  8.132e-02   1.070  0.28476    
## agea45        1.928e-02  8.457e-02   0.228  0.81962    
## agea50        1.745e-02  9.323e-02   0.187  0.85155    
## agea55        1.752e-01  1.094e-01   1.602  0.10926    
## agea60        6.177e-02  1.175e-01   0.526  0.59904    
## agea65        2.652e-01  1.047e-01   2.533  0.01131 *  
## agena        -1.419e-01  1.498e-01  -0.947  0.34347    
## areaz106     -4.105e-02  1.321e-01  -0.311  0.75603    
## areaz110     -2.075e-01  1.045e-01  -1.986  0.04703 *  
## areaz114      3.801e-02  1.111e-01   0.342  0.73214    
## areaz115      2.599e-01  9.682e-02   2.684  0.00727 ** 
## areaz221      1.817e-01  9.753e-02   1.863  0.06243 .  
## areazOthers  -4.677e-02  1.045e-01  -0.448  0.65435    
## areazUnknown -1.695e-01  1.232e-01  -1.375  0.16912    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27629  on 20007  degrees of freedom
## Residual deviance: 23295  on 19984  degrees of freedom
## AIC: 23343
## 
## Number of Fisher Scoring iterations: 5
pred =  predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm
##        predict
## actual  FALSE TRUE
##   FALSE  3730  873
##   TRUE   1700 2273
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts          # 0.69998
## [1] 0.6999767
colAUC(pred, TS$buy)                                   # 0.7556
##                     [,1]
## FALSE vs. TRUE 0.7556038


Regression Model

A2 = subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
lm1 = lm(amount ~ ., TR2[,c(2:6,8:10)])
summary(lm1)
## 
## Call:
## lm(formula = amount ~ ., data = TR2[, c(2:6, 8:10)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.83304 -0.22812  0.04853  0.28096  1.64243 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.1403704  0.0504979  22.583  < 2e-16 ***
## r             0.0000702  0.0003090   0.227  0.82030    
## s             0.0001173  0.0003123   0.376  0.70721    
## f             0.0256836  0.0017965  14.296  < 2e-16 ***
## m             0.5045943  0.0372711  13.539  < 2e-16 ***
## rev           0.0450307  0.0360945   1.248  0.21222    
## agea25        0.0737926  0.0251165   2.938  0.00331 ** 
## agea30        0.1204660  0.0230651   5.223 1.80e-07 ***
## agea35        0.1264592  0.0227496   5.559 2.79e-08 ***
## agea40        0.1382214  0.0232522   5.944 2.87e-09 ***
## agea45        0.1085828  0.0242698   4.474 7.77e-06 ***
## agea50        0.0787808  0.0264917   2.974  0.00295 ** 
## agea55        0.0703242  0.0312462   2.251  0.02443 *  
## agea60        0.0694822  0.0321119   2.164  0.03051 *  
## agea65       -0.0284007  0.0282282  -1.006  0.31439    
## agena         0.1124434  0.0395590   2.842  0.00449 ** 
## areaz106      0.0789586  0.0435321   1.814  0.06974 .  
## areaz110      0.0375241  0.0353641   1.061  0.28868    
## areaz114     -0.0111101  0.0371762  -0.299  0.76506    
## areaz115      0.0111809  0.0325803   0.343  0.73147    
## areaz221      0.0147066  0.0328140   0.448  0.65403    
## areazOthers   0.0249228  0.0349567   0.713  0.47589    
## areazUnknown  0.0105550  0.0388962   0.271  0.78612    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4216 on 9246 degrees of freedom
## Multiple R-squared:  0.291,  Adjusted R-squared:  0.2893 
## F-statistic: 172.5 on 22 and 9246 DF,  p-value: < 2.2e-16
r2.tr = summary(lm1)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) -  TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(r2.tr, r2.ts)
## [1] 0.2909908 0.2575966


Prediction

Aggregate data 2000-12-01 ~ 2001~02-28.

load("../unit15/data/tf0.rdata")
d0 = max(X0$date) + 1
B = X0 %>% 
  filter(date >= as.Date("2000-12-01")) %>% 
  mutate(days = as.integer(difftime(d0, date, units="days"))) %>% 
  group_by(cust) %>% summarise(
    r = min(days),      # recency
    s = max(days),      # seniority
    f = n(),            # frquency
    m = mean(total),    # monetary
    rev = sum(total),   # total revenue contribution
    raw = sum(gross),   # total gross profit contribution
    age = age[1],       # age group
    area = area[1],     # area code
  ) %>% data.frame      # 28584
nrow(B)
## [1] 28531

In B, there is a record for each customer. B$Buy is the probability of buying in March.

B$Buy = predict(glm1, B, type="response")

💡: 預測購買金額時要記得做指數、對數轉換!

B2 = B %>% mutate_at(c("m","rev"), log10)
B$Rev = 10^predict(lm1, B2)
par(mfrow=c(1,2), cex=0.8)
hist(B$Buy)
hist(log(B$Rev,10))

save(B, file="../unit15/data/tf3.rdata")