1. 讀進資料
TX = read.parquet("hdfs:/home/tonychuo/acquire/pq/TX")
2. 資料表格 (結構化資料)
id chain dept category company brand date productsize
1 4351032773 191 37 3711 105100050 2820 2012-11-15 15.25
2 4351032773 191 33 3303 102840020 10241 2012-11-15 10.00
3 4351032773 191 37 3706 105010050 8089 2012-11-15 14.00
4 4351032773 191 0 0 10000 0 2012-11-16 0.00
5 4351032773 191 58 5828 107790070 9200 2012-11-16 9.60
6 4351032773 191 56 5620 107283070 18390 2012-11-16 16.00
7 4351032773 191 27 2707 1082927484 11516 2012-11-16 13.50
8 4351032773 191 27 2702 101780010 95301 2012-11-16 16.50
9 4351032773 191 59 5906 104060040 26155 2012-11-16 15.00
10 4351032773 191 95 9518 106700868 25075 2012-11-16 1.00
productmeasure purchasequantity purchaseamount
1 OZ 1 2.59
2 OZ 1 2.99
3 OZ 1 2.03
4 <NA> 1 0.00
5 OZ 1 3.99
6 OZ 1 5.09
7 LB 1 11.51
8 LB 2 45.66
9 OZ 1 2.97
10 CT 1 4.49
資料欄位、格式
root
|-- id: long (nullable = true)
|-- chain: integer (nullable = true)
|-- dept: integer (nullable = true)
|-- category: integer (nullable = true)
|-- company: long (nullable = true)
|-- brand: integer (nullable = true)
|-- date: timestamp (nullable = true)
|-- productsize: double (nullable = true)
|-- productmeasure: string (nullable = true)
|-- purchasequantity: integer (nullable = true)
|-- purchaseamount: double (nullable = true)
資料概況
TX %>% agg(
n_id = n_distinct(TX$id),
n_chain = n_distinct(TX$chain),
n_dept = n_distinct(TX$dept),
n_category = n_distinct(TX$category),
n_brand = n_distinct(TX$brand)
) %>% collect
n_id n_chain n_dept n_category n_brand
1 311541 134 83 836 35689
3. 資料彙整
mxCD = sum(pivot(groupBy(TX,"chain"),"dept"),"purchaseamount") %>% collect
sparkR.session.stop() # disconnect from Spark
[1] 134 84
chain 0 1 2 3 4 5 6 7
1 392 -32.39 2760.8 299.6 782.65 33872 6887.2 9848 6325.4
2 31 5638.81 214667.1 108830.7 127093.08 412373 798062.6 249889 481776.9
3 85 42662.42 69284.2 67079.9 59267.10 135816 372501.3 64452 198151.4
4 65 -13384.33 37074.6 53812.0 55219.92 128390 288490.7 44887 180507.5
5 133 -9769.70 31743.1 34163.2 50032.29 132509 239754.4 43712 105679.4
8 9 10 11 12 13 14 15
1 1009.6 10163 263.65 1580.1 861.13 34.45 248.12 662.66
2 700112.9 2259195 172472.12 130178.1 130442.94 60450.09 104130.58 49651.31
3 267450.4 943426 68292.48 55993.9 57666.30 37354.24 77625.21 15594.44
4 214341.7 784570 65302.01 56646.6 35889.29 19083.68 54664.60 14196.24
5 127047.8 473862 40022.06 29166.1 20558.69 9429.08 40888.41 12992.61
16 17 18 19 20 21 22 23
1 3733.7 3909.7 18539 1150.5 260.86 8370.4 2444.9 73.19
2 78072.4 144166.8 443615 215318.1 52500.82 894716.9 157686.0 152642.62
3 23961.5 71193.8 147072 80125.1 20429.08 270978.5 67714.8 54870.75
4 17093.9 60218.7 117057 67045.7 17402.25 203122.5 60833.9 46758.21
5 14987.2 38086.6 98852 47961.4 14103.83 104500.3 42706.6 32444.84
24 25 26 27 28 29 30 31
1 343.68 1789.9 44299 8682.2 283.76 1196.2 468.82 172.5
2 98639.68 365893.7 1235240 736578.6 142385.17 562177.8 337199.63 225491.6
3 54893.76 130967.5 610343 444188.9 58910.95 248728.0 144551.62 74017.7
4 32105.04 118882.6 503511 394494.1 40198.42 195813.1 101799.16 57138.8
5 10600.95 65071.3 317188 173503.8 32764.77 106089.4 54132.32 39948.8
32 33 34 35 36 37 38 39 40
1 186.3 18893 16689 7760.1 22411 1913.4 411.75 30.08 316.24
2 251740.1 1341723 125330 379395.9 2164925 282817.2 123635.67 43223.69 78561.68
3 89405.0 507796 55103 215384.0 783295 175828.8 81527.21 20047.77 37133.33
4 72232.9 447859 45848 181149.4 687598 115679.1 47367.25 15575.65 26695.80
5 31067.4 270931 38995 103859.7 316273 80307.9 18770.89 10603.34 15612.14
41 42 43 44 45 46 47 49
1 239.85 NA 80.37 18.03 605.37 NA NA 154.62
2 391026.25 36747.1 41554.86 150608.42 847829.17 45201.2 16437.5 127472.84
3 208502.48 14171.3 20358.78 88399.39 348248.07 12082.9 6025.8 60401.20
4 143766.59 6772.7 17034.13 76900.45 232277.13 9816.7 4247.3 52775.81
5 85002.28 3910.0 10718.72 31356.01 126961.91 12455.0 9967.8 25661.98
50 51 53 54 55 56 57 58
1 124.78 4204.1 NA 390.07 4333.6 1320.6 335.24 1968.4
2 27447.93 957983.1 202207 186164.68 733628.2 1198395.8 369486.16 1646493.8
3 16843.89 548470.0 105220 94431.28 277246.8 447608.9 122382.43 835174.6
4 12686.67 442044.2 86524 88766.11 228970.3 342743.8 99015.77 599248.6
5 6709.02 251099.7 44456 42027.54 113538.7 289438.4 93363.72 378984.4
59 60 62 63 64 65 67 69 70
1 392.07 73.93 NA 8663.8 10644 7.18 2044.6 8761.2 12408.1
2 271850.84 119583.85 88669 2117395.9 5157154 22472.73 23712.7 77119.1 63274.7
3 94306.29 75087.35 26414 706086.9 587316 13388.28 6053.9 29265.9 13963.7
4 75728.03 61623.72 23550 581404.1 542300 12673.39 4160.8 20986.8 9844.8
5 53175.31 36513.37 16012 391630.3 57320 6076.52 6435.3 18485.7 11375.1
71 72 73 74 75 78 79 81 82 91
1 26004 31536 115774 17141 25616 2522.9 3923.5 30496 NA 319.87
2 195735 199660 513642 113054 245012 3587.8 86720.7 629304 64303.2 5735899.64
3 129533 69098 198573 37467 31861 3024.7 27982.0 185839 4814.2 4672.11
4 46329 51354 118793 25446 24263 1398.8 26520.1 387960 3069.6 5006.08
5 47732 46572 212787 28856 29961 2164.1 21746.7 84934 55531.2 6709.47
92 94 95 96 97 98 99
1 52787 NA 19561 10031 27984 1663.3 24.94
2 108586 NA 84278 174534 698913 12470.4 1058400.51
3 15914 NA 35465 98504 243527 1613.9 380685.42
4 7088 NA 22839 14003 167879 1051.9 300042.60
5 34282 0 45008 299790 145947 2208.0 208850.46
4. 資料視覺化
mx = mxCD; mx[is.na(mx) | mx < 0] = 0
rownames(mx) = mx$chain; mx$chain=NULL
names(mx) = paste0('dept', names(mx))
d3heatmap(log(1+mx), col=rev(spectral))