1. 讀進資料
TX = read.parquet("hdfs:/home/tonychuo/acquire/pq/TX")
2. 資料表格 (結構化資料)
head(TX,10)
           id chain dept category    company brand       date productsize
1  4351032773   191   37     3711  105100050  2820 2012-11-15       15.25
2  4351032773   191   33     3303  102840020 10241 2012-11-15       10.00
3  4351032773   191   37     3706  105010050  8089 2012-11-15       14.00
4  4351032773   191    0        0      10000     0 2012-11-16        0.00
5  4351032773   191   58     5828  107790070  9200 2012-11-16        9.60
6  4351032773   191   56     5620  107283070 18390 2012-11-16       16.00
7  4351032773   191   27     2707 1082927484 11516 2012-11-16       13.50
8  4351032773   191   27     2702  101780010 95301 2012-11-16       16.50
9  4351032773   191   59     5906  104060040 26155 2012-11-16       15.00
10 4351032773   191   95     9518  106700868 25075 2012-11-16        1.00
   productmeasure purchasequantity purchaseamount
1              OZ                1           2.59
2              OZ                1           2.99
3              OZ                1           2.03
4            <NA>                1           0.00
5              OZ                1           3.99
6              OZ                1           5.09
7              LB                1          11.51
8              LB                2          45.66
9              OZ                1           2.97
10             CT                1           4.49
資料欄位、格式
printSchema(TX)
root
 |-- id: long (nullable = true)
 |-- chain: integer (nullable = true)
 |-- dept: integer (nullable = true)
 |-- category: integer (nullable = true)
 |-- company: long (nullable = true)
 |-- brand: integer (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- productsize: double (nullable = true)
 |-- productmeasure: string (nullable = true)
 |-- purchasequantity: integer (nullable = true)
 |-- purchaseamount: double (nullable = true)
資料大小
count(TX)
[1] 349655789
資料概況
TX %>% agg(
  n_id = n_distinct(TX$id), 
  n_chain = n_distinct(TX$chain), 
  n_dept = n_distinct(TX$dept),
  n_category = n_distinct(TX$category),
  n_brand = n_distinct(TX$brand)
  ) %>% collect
    n_id n_chain n_dept n_category n_brand
1 311541     134     83        836   35689
3. 資料彙整
mxCD = sum(pivot(groupBy(TX,"chain"),"dept"),"purchaseamount") %>% collect 
sparkR.session.stop() # disconnect from Spark
dim(mxCD)
[1] 134  84
head(mxCD,5)
  chain         0        1        2         3      4        5      6        7
1   392    -32.39   2760.8    299.6    782.65  33872   6887.2   9848   6325.4
2    31   5638.81 214667.1 108830.7 127093.08 412373 798062.6 249889 481776.9
3    85  42662.42  69284.2  67079.9  59267.10 135816 372501.3  64452 198151.4
4    65 -13384.33  37074.6  53812.0  55219.92 128390 288490.7  44887 180507.5
5   133  -9769.70  31743.1  34163.2  50032.29 132509 239754.4  43712 105679.4
         8       9        10       11        12       13        14       15
1   1009.6   10163    263.65   1580.1    861.13    34.45    248.12   662.66
2 700112.9 2259195 172472.12 130178.1 130442.94 60450.09 104130.58 49651.31
3 267450.4  943426  68292.48  55993.9  57666.30 37354.24  77625.21 15594.44
4 214341.7  784570  65302.01  56646.6  35889.29 19083.68  54664.60 14196.24
5 127047.8  473862  40022.06  29166.1  20558.69  9429.08  40888.41 12992.61
       16       17     18       19       20       21       22        23
1  3733.7   3909.7  18539   1150.5   260.86   8370.4   2444.9     73.19
2 78072.4 144166.8 443615 215318.1 52500.82 894716.9 157686.0 152642.62
3 23961.5  71193.8 147072  80125.1 20429.08 270978.5  67714.8  54870.75
4 17093.9  60218.7 117057  67045.7 17402.25 203122.5  60833.9  46758.21
5 14987.2  38086.6  98852  47961.4 14103.83 104500.3  42706.6  32444.84
        24       25      26       27        28       29        30       31
1   343.68   1789.9   44299   8682.2    283.76   1196.2    468.82    172.5
2 98639.68 365893.7 1235240 736578.6 142385.17 562177.8 337199.63 225491.6
3 54893.76 130967.5  610343 444188.9  58910.95 248728.0 144551.62  74017.7
4 32105.04 118882.6  503511 394494.1  40198.42 195813.1 101799.16  57138.8
5 10600.95  65071.3  317188 173503.8  32764.77 106089.4  54132.32  39948.8
        32      33     34       35      36       37        38       39       40
1    186.3   18893  16689   7760.1   22411   1913.4    411.75    30.08   316.24
2 251740.1 1341723 125330 379395.9 2164925 282817.2 123635.67 43223.69 78561.68
3  89405.0  507796  55103 215384.0  783295 175828.8  81527.21 20047.77 37133.33
4  72232.9  447859  45848 181149.4  687598 115679.1  47367.25 15575.65 26695.80
5  31067.4  270931  38995 103859.7  316273  80307.9  18770.89 10603.34 15612.14
         41      42       43        44        45      46      47        49
1    239.85      NA    80.37     18.03    605.37      NA      NA    154.62
2 391026.25 36747.1 41554.86 150608.42 847829.17 45201.2 16437.5 127472.84
3 208502.48 14171.3 20358.78  88399.39 348248.07 12082.9  6025.8  60401.20
4 143766.59  6772.7 17034.13  76900.45 232277.13  9816.7  4247.3  52775.81
5  85002.28  3910.0 10718.72  31356.01 126961.91 12455.0  9967.8  25661.98
        50       51     53        54       55        56        57        58
1   124.78   4204.1     NA    390.07   4333.6    1320.6    335.24    1968.4
2 27447.93 957983.1 202207 186164.68 733628.2 1198395.8 369486.16 1646493.8
3 16843.89 548470.0 105220  94431.28 277246.8  447608.9 122382.43  835174.6
4 12686.67 442044.2  86524  88766.11 228970.3  342743.8  99015.77  599248.6
5  6709.02 251099.7  44456  42027.54 113538.7  289438.4  93363.72  378984.4
         59        60    62        63      64       65      67      69      70
1    392.07     73.93    NA    8663.8   10644     7.18  2044.6  8761.2 12408.1
2 271850.84 119583.85 88669 2117395.9 5157154 22472.73 23712.7 77119.1 63274.7
3  94306.29  75087.35 26414  706086.9  587316 13388.28  6053.9 29265.9 13963.7
4  75728.03  61623.72 23550  581404.1  542300 12673.39  4160.8 20986.8  9844.8
5  53175.31  36513.37 16012  391630.3   57320  6076.52  6435.3 18485.7 11375.1
      71     72     73     74     75     78      79     81      82         91
1  26004  31536 115774  17141  25616 2522.9  3923.5  30496      NA     319.87
2 195735 199660 513642 113054 245012 3587.8 86720.7 629304 64303.2 5735899.64
3 129533  69098 198573  37467  31861 3024.7 27982.0 185839  4814.2    4672.11
4  46329  51354 118793  25446  24263 1398.8 26520.1 387960  3069.6    5006.08
5  47732  46572 212787  28856  29961 2164.1 21746.7  84934 55531.2    6709.47
      92 94    95     96     97      98         99
1  52787 NA 19561  10031  27984  1663.3      24.94
2 108586 NA 84278 174534 698913 12470.4 1058400.51
3  15914 NA 35465  98504 243527  1613.9  380685.42
4   7088 NA 22839  14003 167879  1051.9  300042.60
5  34282  0 45008 299790 145947  2208.0  208850.46
4. 資料視覺化
mx = mxCD; mx[is.na(mx) | mx < 0] = 0
rownames(mx) = mx$chain; mx$chain=NULL
names(mx) = paste0('dept', names(mx))
d3heatmap(log(1+mx), col=rev(spectral))