資料準備流程

Fig-1: Data Preparation

Fig-1: Data Preparation


Preparing The Predictors (X)

rm(list=ls(all=TRUE))
Sys.setlocale("LC_TIME","C")
## [1] "C"
pacman::p_load(magrittr, readr, caTools, ggplot2, dplyr)
load("data/tf0.rdata")
The Demarcation Date

Remove data after the demarcation date

feb01 = as.Date("2001-02-01")
Z = subset(Z0, date < feb01)    # 618212
Aggregate for the Transaction Records
X = group_by(Z, tid) %>% summarise(
  date = first(date),  # 交易日期
  cust = first(cust),  # 顧客 ID
  age = first(age),    # 顧客 年齡級別
  area = first(area),  # 顧客 居住區別
  items = n(),                # 交易項目(總)數
  pieces = sum(qty),          # 產品(總)件數
  total = sum(price),         # 交易(總)金額
  gross = sum(price - cost)   # 毛利
  ) %>% data.frame  # 88387
summary(X)
##       tid             date                cust          
##  Min.   :    1   Min.   :2000-11-01   Length:88387      
##  1st Qu.:22098   1st Qu.:2000-11-23   Class :character  
##  Median :44194   Median :2000-12-12   Mode  :character  
##  Mean   :44194   Mean   :2000-12-15                     
##  3rd Qu.:66291   3rd Qu.:2001-01-12                     
##  Max.   :88387   Max.   :2001-01-31                     
##      age                area               items             pieces       
##  Length:88387       Length:88387       Min.   :  1.000   Min.   :  1.000  
##  Class :character   Class :character   1st Qu.:  2.000   1st Qu.:  3.000  
##  Mode  :character   Mode  :character   Median :  5.000   Median :  6.000  
##                                        Mean   :  6.994   Mean   :  9.453  
##                                        3rd Qu.:  9.000   3rd Qu.: 12.000  
##                                        Max.   :112.000   Max.   :339.000  
##      total             gross        
##  Min.   :    5.0   Min.   :-1645.0  
##  1st Qu.:  230.0   1st Qu.:   23.0  
##  Median :  522.0   Median :   72.0  
##  Mean   :  888.7   Mean   :  138.3  
##  3rd Qu.: 1120.0   3rd Qu.:  174.0  
##  Max.   :30171.0   Max.   : 8069.0
Check Quantile and Remove Outlier
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
##          items   pieces     total    gross
## 99.9%  56.0000  84.0000  9378.684 1883.228
## 99.95% 64.0000  98.0000 11261.751 2317.087
## 99.99% 85.6456 137.6456 17699.325 3389.646
X = subset(X, items<=64 & pieces<=98 & total<=11260) # 88387 -> 88295
Aggregate for Customer Records
d0 = max(X$date) + 1
A = X %>% mutate(
  days = as.integer(difftime(d0, date, units="days"))
  ) %>% 
  group_by(cust) %>% summarise(
    r = min(days),      # recency
    s = max(days),      # seniority
    f = n(),            # frquency
    m = mean(total),    # monetary
    rev = sum(total),   # total revenue contribution
    raw = sum(gross),   # total gross profit contribution
    age = age[1],       # age group
    area = area[1],     # area code
  ) %>% data.frame      # 28584
nrow(A)
## [1] 28584



Preparing the Target Variables (Y)

Aggregate Feb’s Transaction by Customer
feb = filter(X0, date>= feb01) %>% group_by(cust) %>% 
  summarise(amount = sum(total))  # 16899
The Target for Regression - A$amount

Simply a Left Joint

A = merge(A, feb, by="cust", all.x=T)
The Target for Classification - A$buy
A$buy = !is.na(A$amount)
Summary of the Dataset
summary(A)
##      cust                 r               s               f         
##  Length:28584       Min.   : 1.00   Min.   : 1.00   Min.   : 1.000  
##  Class :character   1st Qu.:11.00   1st Qu.:47.00   1st Qu.: 1.000  
##  Mode  :character   Median :21.00   Median :68.00   Median : 2.000  
##                     Mean   :32.12   Mean   :61.27   Mean   : 3.089  
##                     3rd Qu.:53.00   3rd Qu.:83.00   3rd Qu.: 4.000  
##                     Max.   :92.00   Max.   :92.00   Max.   :60.000  
##                                                                     
##        m                rev             raw              age           
##  Min.   :    8.0   Min.   :    8   Min.   : -742.0   Length:28584      
##  1st Qu.:  359.4   1st Qu.:  638   1st Qu.:   70.0   Class :character  
##  Median :  709.5   Median : 1566   Median :  218.0   Mode  :character  
##  Mean   : 1012.4   Mean   : 2711   Mean   :  420.8                     
##  3rd Qu.: 1315.0   3rd Qu.: 3426   3rd Qu.:  535.0                     
##  Max.   :10634.0   Max.   :99597   Max.   :15565.0                     
##                                                                        
##      area               amount         buy         
##  Length:28584       Min.   :    8   Mode :logical  
##  Class :character   1st Qu.:  454   FALSE:15342    
##  Mode  :character   Median :  993   TRUE :13242    
##                     Mean   : 1499                  
##                     3rd Qu.: 1955                  
##                     Max.   :28089                  
##                     NA's   :15342
The Association of Categorial Predictors
tapply(A$buy, A$age, mean) %>% barplot
abline(h = mean(A$buy), col='red')

tapply(A$buy, A$area, mean) %>% barplot(las=2)
abline(h = mean(A$buy), col='red')

Contest Dataset
X = subset(X, cust %in% A$cust & date < as.Date("2001-02-01"))
Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))
set.seed(2018); spl = sample.split(A$buy, SplitRatio=0.7)
c(nrow(A), sum(spl), sum(!spl))
## [1] 28584 20008  8576
cbind(A, spl) %>% filter(buy) %>% 
  ggplot(aes(x=log(amount))) + geom_density(aes(fill=spl), alpha=0.5)

A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
n = nrow(A2)
set.seed(2018); spl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(A2), sum(spl2), sum(!spl2))
## [1] 13242  9269  3973
cbind(A2, spl2) %>% 
  ggplot(aes(x=amount)) + geom_density(aes(fill=spl2), alpha=0.5)

save(Z, X, A, spl, spl2, file="data/tf2.rdata")