pacman::p_load(dplyr,ggplot2,caTools)
rm(list=ls(all=TRUE))
Sys.setlocale("LC_TIME","C")
## [1] "C"
load("../unit15/data/tf2.rdata")
TR = subset(A, spl)
TS = subset(A, !spl)
glm1 = glm(buy ~ ., TR[,c(2:9, 11)], family=binomial())
summary(glm1)
##
## Call:
## glm(formula = buy ~ ., family = binomial(), data = TR[, c(2:9,
## 11)])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.7931 -0.8733 -0.6991 1.0384 1.8735
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.259e+00 1.261e-01 -9.985 < 2e-16 ***
## r -1.227e-02 8.951e-04 -13.708 < 2e-16 ***
## s 9.566e-03 9.101e-04 10.511 < 2e-16 ***
## f 2.905e-01 1.593e-02 18.233 < 2e-16 ***
## m -3.028e-05 2.777e-05 -1.090 0.27559
## rev 4.086e-05 1.940e-05 2.106 0.03521 *
## raw -2.306e-04 8.561e-05 -2.693 0.00708 **
## agea25 -4.194e-02 8.666e-02 -0.484 0.62838
## agea30 1.772e-02 7.992e-02 0.222 0.82456
## agea35 7.705e-02 7.921e-02 0.973 0.33074
## agea40 8.699e-02 8.132e-02 1.070 0.28476
## agea45 1.928e-02 8.457e-02 0.228 0.81962
## agea50 1.745e-02 9.323e-02 0.187 0.85155
## agea55 1.752e-01 1.094e-01 1.602 0.10926
## agea60 6.177e-02 1.175e-01 0.526 0.59904
## agea65 2.652e-01 1.047e-01 2.533 0.01131 *
## agena -1.419e-01 1.498e-01 -0.947 0.34347
## areaz106 -4.105e-02 1.321e-01 -0.311 0.75603
## areaz110 -2.075e-01 1.045e-01 -1.986 0.04703 *
## areaz114 3.801e-02 1.111e-01 0.342 0.73214
## areaz115 2.599e-01 9.682e-02 2.684 0.00727 **
## areaz221 1.817e-01 9.753e-02 1.863 0.06243 .
## areazOthers -4.677e-02 1.045e-01 -0.448 0.65435
## areazUnknown -1.695e-01 1.232e-01 -1.375 0.16912
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27629 on 20007 degrees of freedom
## Residual deviance: 23295 on 19984 degrees of freedom
## AIC: 23343
##
## Number of Fisher Scoring iterations: 5
pred = predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm
## predict
## actual FALSE TRUE
## FALSE 3730 873
## TRUE 1700 2273
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # 0.69998
## [1] 0.6999767
colAUC(pred, TS$buy) # 0.7556
## [,1]
## FALSE vs. TRUE 0.7556038
A2 = subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
lm1 = lm(amount ~ ., TR2[,c(2:6,8:10)])
summary(lm1)
##
## Call:
## lm(formula = amount ~ ., data = TR2[, c(2:6, 8:10)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.83304 -0.22812 0.04853 0.28096 1.64243
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.1403704 0.0504979 22.583 < 2e-16 ***
## r 0.0000702 0.0003090 0.227 0.82030
## s 0.0001173 0.0003123 0.376 0.70721
## f 0.0256836 0.0017965 14.296 < 2e-16 ***
## m 0.5045943 0.0372711 13.539 < 2e-16 ***
## rev 0.0450307 0.0360945 1.248 0.21222
## agea25 0.0737926 0.0251165 2.938 0.00331 **
## agea30 0.1204660 0.0230651 5.223 1.80e-07 ***
## agea35 0.1264592 0.0227496 5.559 2.79e-08 ***
## agea40 0.1382214 0.0232522 5.944 2.87e-09 ***
## agea45 0.1085828 0.0242698 4.474 7.77e-06 ***
## agea50 0.0787808 0.0264917 2.974 0.00295 **
## agea55 0.0703242 0.0312462 2.251 0.02443 *
## agea60 0.0694822 0.0321119 2.164 0.03051 *
## agea65 -0.0284007 0.0282282 -1.006 0.31439
## agena 0.1124434 0.0395590 2.842 0.00449 **
## areaz106 0.0789586 0.0435321 1.814 0.06974 .
## areaz110 0.0375241 0.0353641 1.061 0.28868
## areaz114 -0.0111101 0.0371762 -0.299 0.76506
## areaz115 0.0111809 0.0325803 0.343 0.73147
## areaz221 0.0147066 0.0328140 0.448 0.65403
## areazOthers 0.0249228 0.0349567 0.713 0.47589
## areazUnknown 0.0105550 0.0388962 0.271 0.78612
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4216 on 9246 degrees of freedom
## Multiple R-squared: 0.291, Adjusted R-squared: 0.2893
## F-statistic: 172.5 on 22 and 9246 DF, p-value: < 2.2e-16
r2.tr = summary(lm1)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) - TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(r2.tr, r2.ts)
## [1] 0.2909908 0.2575966
Aggregate data 2000-12-01 ~ 2001~02-28.
load("../unit15/data/tf0.rdata")
d0 = max(X0$date) + 1
B = X0 %>%
filter(date >= as.Date("2000-12-01")) %>%
mutate(days = as.integer(difftime(d0, date, units="days"))) %>%
group_by(cust) %>% summarise(
r = min(days), # recency
s = max(days), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = age[1], # age group
area = area[1], # area code
) %>% data.frame # 28584
nrow(B)
## [1] 28531
In B
, there is a record for each customer. B$Buy
is the probability of buying in March.
B$Buy = predict(glm1, B, type="response")
💡: 預測購買金額時要記得做指數、對數轉換!
B2 = B %>% mutate_at(c("m","rev"), log10)
B$Rev = 10^predict(lm1, B2)
par(mfrow=c(1,2), cex=0.8)
hist(B$Buy)
hist(log(B$Rev,10))
save(B, file="../unit15/data/tf3.rdata")