rm(list=ls(all=TRUE))
Sys.setlocale("LC_TIME","C")
## [1] "C"
pacman::p_load(magrittr, readr, caTools, ggplot2, dplyr)
load("data/tf0.rdata")
Remove data after the demarcation date
feb01 = as.Date("2001-02-01")
Z = subset(Z0, date < feb01) # 618212
X = group_by(Z, tid) %>% summarise(
date = first(date), # 交易日期
cust = first(cust), # 顧客 ID
age = first(age), # 顧客 年齡級別
area = first(area), # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame # 88387
summary(X)
## tid date cust
## Min. : 1 Min. :2000-11-01 Length:88387
## 1st Qu.:22098 1st Qu.:2000-11-23 Class :character
## Median :44194 Median :2000-12-12 Mode :character
## Mean :44194 Mean :2000-12-15
## 3rd Qu.:66291 3rd Qu.:2001-01-12
## Max. :88387 Max. :2001-01-31
## age area items pieces
## Length:88387 Length:88387 Min. : 1.000 Min. : 1.000
## Class :character Class :character 1st Qu.: 2.000 1st Qu.: 3.000
## Mode :character Mode :character Median : 5.000 Median : 6.000
## Mean : 6.994 Mean : 9.453
## 3rd Qu.: 9.000 3rd Qu.: 12.000
## Max. :112.000 Max. :339.000
## total gross
## Min. : 5.0 Min. :-1645.0
## 1st Qu.: 230.0 1st Qu.: 23.0
## Median : 522.0 Median : 72.0
## Mean : 888.7 Mean : 138.3
## 3rd Qu.: 1120.0 3rd Qu.: 174.0
## Max. :30171.0 Max. : 8069.0
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
## items pieces total gross
## 99.9% 56.0000 84.0000 9378.684 1883.228
## 99.95% 64.0000 98.0000 11261.751 2317.087
## 99.99% 85.6456 137.6456 17699.325 3389.646
X = subset(X, items<=64 & pieces<=98 & total<=11260) # 88387 -> 88295
d0 = max(X$date) + 1
A = X %>% mutate(
days = as.integer(difftime(d0, date, units="days"))
) %>%
group_by(cust) %>% summarise(
r = min(days), # recency
s = max(days), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = age[1], # age group
area = area[1], # area code
) %>% data.frame # 28584
nrow(A)
## [1] 28584
feb = filter(X0, date>= feb01) %>% group_by(cust) %>%
summarise(amount = sum(total)) # 16899
A$amount
Simply a Left Joint
A = merge(A, feb, by="cust", all.x=T)
A$buy
A$buy = !is.na(A$amount)
summary(A)
## cust r s f
## Length:28584 Min. : 1.00 Min. : 1.00 Min. : 1.000
## Class :character 1st Qu.:11.00 1st Qu.:47.00 1st Qu.: 1.000
## Mode :character Median :21.00 Median :68.00 Median : 2.000
## Mean :32.12 Mean :61.27 Mean : 3.089
## 3rd Qu.:53.00 3rd Qu.:83.00 3rd Qu.: 4.000
## Max. :92.00 Max. :92.00 Max. :60.000
##
## m rev raw age
## Min. : 8.0 Min. : 8 Min. : -742.0 Length:28584
## 1st Qu.: 359.4 1st Qu.: 638 1st Qu.: 70.0 Class :character
## Median : 709.5 Median : 1566 Median : 218.0 Mode :character
## Mean : 1012.4 Mean : 2711 Mean : 420.8
## 3rd Qu.: 1315.0 3rd Qu.: 3426 3rd Qu.: 535.0
## Max. :10634.0 Max. :99597 Max. :15565.0
##
## area amount buy
## Length:28584 Min. : 8 Mode :logical
## Class :character 1st Qu.: 454 FALSE:15342
## Mode :character Median : 993 TRUE :13242
## Mean : 1499
## 3rd Qu.: 1955
## Max. :28089
## NA's :15342
tapply(A$buy, A$age, mean) %>% barplot
abline(h = mean(A$buy), col='red')
tapply(A$buy, A$area, mean) %>% barplot(las=2)
abline(h = mean(A$buy), col='red')
X = subset(X, cust %in% A$cust & date < as.Date("2001-02-01"))
Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))
set.seed(2018); spl = sample.split(A$buy, SplitRatio=0.7)
c(nrow(A), sum(spl), sum(!spl))
## [1] 28584 20008 8576
cbind(A, spl) %>% filter(buy) %>%
ggplot(aes(x=log(amount))) + geom_density(aes(fill=spl), alpha=0.5)
A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
n = nrow(A2)
set.seed(2018); spl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(A2), sum(spl2), sum(!spl2))
## [1] 13242 9269 3973
cbind(A2, spl2) %>%
ggplot(aes(x=amount)) + geom_density(aes(fill=spl2), alpha=0.5)
save(Z, X, A, spl, spl2, file="data/tf2.rdata")