Z
rm(list=ls(all=T))
Sys.setlocale("LC_TIME","C")
## [1] "C"
pacman::p_load(magrittr, readr, caTools, ggplot2, dplyr)
Z = read_csv("data/ta_feng_all_months_merged.csv") %>% data.frame %>%
setNames(c("date","cust","age","area","cat","prod","qty","cost","price"))
## Parsed with column specification:
## cols(
## TRANSACTION_DT = col_character(),
## CUSTOMER_ID = col_character(),
## AGE_GROUP = col_character(),
## PIN_CODE = col_character(),
## PRODUCT_SUBCLASS = col_integer(),
## PRODUCT_ID = col_character(),
## AMOUNT = col_integer(),
## ASSET = col_integer(),
## SALES_PRICE = col_integer()
## )
nrow(Z)
## [1] 817741
Z$date = as.Date(Z$date, format="%m/%d/%Y")
Z$age[is.na(Z$age)] = "na"
Z$age = factor(Z$age, levels=c(
"<25","25-29","30-34","35-39","40-44","45-49","50-54","55-59","60-64",">65","na"), labels=c(
"a20","a25","a30","a35","a40","a45","a50","a55","a60","a65","na"
)) %>% as.character
Z$area = paste0("z",Z$area)
summary(Z)
## date cust age
## Min. :2000-11-01 Length:817741 Length:817741
## 1st Qu.:2000-11-28 Class :character Class :character
## Median :2001-01-01 Mode :character Mode :character
## Mean :2000-12-30
## 3rd Qu.:2001-01-30
## Max. :2001-02-28
## area cat prod qty
## Length:817741 Min. :100101 Length:817741 Min. : 1.000
## Class :character 1st Qu.:110106 Class :character 1st Qu.: 1.000
## Mode :character Median :130106 Mode :character Median : 1.000
## Mean :284951 Mean : 1.382
## 3rd Qu.:520314 3rd Qu.: 1.000
## Max. :780510 Max. :1200.000
## cost price
## Min. : 0.0 Min. : 1.0
## 1st Qu.: 35.0 1st Qu.: 42.0
## Median : 62.0 Median : 76.0
## Mean : 112.1 Mean : 131.9
## 3rd Qu.: 112.0 3rd Qu.: 132.0
## Max. :432000.0 Max. :444000.0
# Quantile of Variables
sapply(Z[,7:9], quantile, prob=c(.99, .999, .9995))
## qty cost price
## 99% 6 858.0 1014.00
## 99.9% 14 2722.0 3135.82
## 99.95% 24 3799.3 3999.00
# Remove Outliers
Z = subset(Z, qty<=24 & cost<=3800 & price<=4000)
nrow(Z)
## [1] 817182
Z$tid = group_indices(Z, date, cust) # same customer same day
# No. cust, cat, prod, tid
sapply(Z[c("cust","cat","prod","tid")], n_distinct)
## cust cat prod tid
## 32256 2007 23789 119422
# Summary of Item Records
summary(Z)
## date cust age
## Min. :2000-11-01 Length:817182 Length:817182
## 1st Qu.:2000-11-28 Class :character Class :character
## Median :2001-01-01 Mode :character Mode :character
## Mean :2000-12-30
## 3rd Qu.:2001-01-30
## Max. :2001-02-28
## area cat prod qty
## Length:817182 Min. :100101 Length:817182 Min. : 1.000
## Class :character 1st Qu.:110106 Class :character 1st Qu.: 1.000
## Mode :character Median :130106 Mode :character Median : 1.000
## Mean :284784 Mean : 1.358
## 3rd Qu.:520311 3rd Qu.: 1.000
## Max. :780510 Max. :24.000
## cost price tid
## Min. : 0.0 Min. : 1.0 Min. : 1
## 1st Qu.: 35.0 1st Qu.: 42.0 1st Qu.: 28783
## Median : 62.0 Median : 76.0 Median : 59391
## Mean : 106.2 Mean : 125.5 Mean : 58845
## 3rd Qu.: 112.0 3rd Qu.: 132.0 3rd Qu.: 87391
## Max. :3798.0 Max. :4000.0 Max. :119422
X
X = Z %>% group_by(tid) %>% summarise(
date = date[1], # 交易日期
cust = cust[1], # 顧客 ID
age = age[1], # 顧客 年齡級別
area = area[1], # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame
nrow(X) # 119422
## [1] 119422
summary(X)
## tid date cust
## Min. : 1 Min. :2000-11-01 Length:119422
## 1st Qu.: 29856 1st Qu.:2000-11-29 Class :character
## Median : 59712 Median :2001-01-01 Mode :character
## Mean : 59712 Mean :2000-12-31
## 3rd Qu.: 89567 3rd Qu.:2001-02-02
## Max. :119422 Max. :2001-02-28
## age area items pieces
## Length:119422 Length:119422 Min. : 1.000 Min. : 1.000
## Class :character Class :character 1st Qu.: 2.000 1st Qu.: 3.000
## Mode :character Mode :character Median : 5.000 Median : 6.000
## Mean : 6.843 Mean : 9.294
## 3rd Qu.: 9.000 3rd Qu.: 12.000
## Max. :112.000 Max. :339.000
## total gross
## Min. : 5 Min. :-1645.0
## 1st Qu.: 227 1st Qu.: 21.0
## Median : 510 Median : 68.0
## Mean : 859 Mean : 132.3
## 3rd Qu.: 1082 3rd Qu.: 169.0
## Max. :30171 Max. : 8069.0
# Check Quantile & Remove Outliers
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
## items pieces total gross
## 99.9% 54 81.0000 9009.579 1824.737
## 99.95% 62 94.2895 10611.579 2179.817
## 99.99% 82 133.0000 16044.401 3226.548
# Remove Outliers
X = subset(X, items<=62 & pieces<95 & total<16000) # 119328
par(cex=0.8)
hist(X$date, "weeks", freq=T, las=2, main="No. Transaction per Week")
A
d0 = max(X$date) + 1
A = X %>% mutate(
days = as.integer(difftime(d0, date, units="days"))
) %>%
group_by(cust) %>% summarise(
r = min(days), # recency
s = max(days), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = age[1], # age group
area = area[1], # area code
) %>% data.frame # 33241
nrow(A)
## [1] 32241
summary(A)
## cust r s f
## Length:32241 Min. : 1.00 Min. : 1.00 Min. : 1.000
## Class :character 1st Qu.: 9.00 1st Qu.: 56.00 1st Qu.: 1.000
## Mode :character Median : 26.00 Median : 92.00 Median : 2.000
## Mean : 37.45 Mean : 80.78 Mean : 3.701
## 3rd Qu.: 60.00 3rd Qu.:110.00 3rd Qu.: 4.000
## Max. :120.00 Max. :120.00 Max. :85.000
## m rev raw age
## Min. : 8.0 Min. : 8 Min. : -784.0 Length:32241
## 1st Qu.: 365.0 1st Qu.: 707 1st Qu.: 75.0 Class :character
## Median : 705.7 Median : 1750 Median : 241.0 Mode :character
## Mean : 993.1 Mean : 3152 Mean : 484.6
## 3rd Qu.: 1291.0 3rd Qu.: 3968 3rd Qu.: 612.0
## Max. :12636.0 Max. :127686 Max. :20273.0
## area
## Length:32241
## Class :character
## Mode :character
##
##
##
par(mfrow=c(3,2), mar=c(3,3,4,2))
for(x in c('r','s','f','m'))
hist(A[,x],freq=T,main=x,xlab="",ylab="",cex.main=2)
hist(pmin(A$f,10),0:10,freq=T,xlab="",ylab="",cex.main=2)
hist(log(A$m,10),freq=T,xlab="",ylab="",cex.main=2)
is.na(Z) %>% colSums
## date cust age area cat prod qty cost price tid
## 0 0 0 0 0 0 0 0 0 0
is.na(X) %>% colSums
## tid date cust age area items pieces total gross
## 0 0 0 0 0 0 0 0 0
is.na(A) %>% colSums
## cust r s f m rev raw age area
## 0 0 0 0 0 0 0 0 0
A0 = A; X0 = X; Z0 = Z
save(Z0, X0, A0, file="data/tf0.rdata")