犯罪是一個國際關注的問題,但它在不同的國家以不同的方式記錄和處理。 在美國,聯邦調查局(FBI)記錄了暴力犯罪和財產犯罪。 此外,每個城市都記錄了犯罪行為,一些城市發布了有關犯罪率的數據。 伊利諾伊州芝加哥市從2001年開始在線發布犯罪數據。

芝加哥是美國人口第三多的城市,人口超過270萬。在這個作業裡面,我們將關注一種特定類型的財產犯罪,稱為「汽車盜竊」,我們將使用R中的一些基本數據分析來了解芝加哥的汽車盜竊紀錄。請載入文件“data/mvtWeek1.csv”:以下是各欄位的描述:



Section-1 Loading the Data

【1.1】How many rows of data (observations) are in this dataset?

D = read.csv("../data/mvtWeek1.csv", stringsAsFactors=F)
nrow(D)
[1] 191641
ncol(D)
[1] 11

檢查各欄位的資料格式

summary(D)
       ID              Date           LocationDescription   Arrest         Domestic      
 Min.   :1310022   Length:191641      Length:191641       Mode :logical   Mode :logical  
 1st Qu.:2832144   Class :character   Class :character    FALSE:176105    FALSE:191226   
 Median :4762956   Mode  :character   Mode  :character    TRUE :15536     TRUE :415      
 Mean   :4968629                                                                         
 3rd Qu.:7201878                                                                         
 Max.   :9181151                                                                         
                                                                                         
      Beat         District     CommunityArea        Year         Latitude   
 Min.   : 111   Min.   : 1      Min.   : 0      Min.   :2001   Min.   :41.6  
 1st Qu.: 722   1st Qu.: 6      1st Qu.:22      1st Qu.:2003   1st Qu.:41.8  
 Median :1121   Median :10      Median :32      Median :2006   Median :41.9  
 Mean   :1259   Mean   :12      Mean   :38      Mean   :2006   Mean   :41.8  
 3rd Qu.:1733   3rd Qu.:17      3rd Qu.:60      3rd Qu.:2009   3rd Qu.:41.9  
 Max.   :2535   Max.   :31      Max.   :77      Max.   :2012   Max.   :42.0  
                NA's   :43056   NA's   :24616                  NA's   :2276  
   Longitude    
 Min.   :-87.9  
 1st Qu.:-87.7  
 Median :-87.7  
 Mean   :-87.7  
 3rd Qu.:-87.6  
 Max.   :-87.5  
 NA's   :2276   

類別(Factor) versus 字串(Character)

【1.2】How many variables are in this dataset?

ncol(D)
[1] 11

【1.3】Using the “max” function, what is the maximum value of the variable “ID”?

max(D$ID)
[1] 9181151

【1.4】 What is the minimum value of the variable “Beat”?

min(D$Beat)
[1] 111

【1.5】 How many observations have value TRUE in the Arrest variable (this is the number of crimes for which an arrest was made)?

sum(D$Arrest)
[1] 15536
mean(D$Arrest)
[1] 0.08107

【1.6】 How many observations have a LocationDescription value of ALLEY?

sum(D$LocationDescription == "ALLEY")
[1] 2308



Section-2 Understanding Dates in R

【2.1】 In what format are the entries in the variable Date?

head(D$Date)  # Month/Day/Year Hour:Minute
[1] "12/31/12 23:15" "12/31/12 22:00" "12/31/12 22:00" "12/31/12 22:00" "12/31/12 21:30"
[6] "12/31/12 20:30"
ts = as.POSIXct(D$Date, format="%m/%d/%y %H:%M")
par(cex=0.7)
hist(ts,"year",las=2,freq=T,xlab="")

table(format(ts,'%w'))

    0     1     2     3     4     5     6 
26316 27397 26791 27416 27319 29284 27118 
table(format(ts,'%m'))

   01    02    03    04    05    06    07    08    09    10    11    12 
16047 13511 15758 15280 16035 16002 16801 16572 16060 17086 16063 16426 
table(weekday=format(ts,'%w'), month=format(ts,'%m'))
       month
weekday   01   02   03   04   05   06   07   08   09   10   11   12
      0 2110 1837 2075 2070 2168 2239 2339 2304 2352 2424 2254 2144
      1 2395 1937 2200 2323 2359 2187 2457 2288 2258 2399 2323 2271
      2 2317 1885 2270 2118 2222 2183 2412 2251 2142 2416 2258 2317
      3 2259 2007 2242 2060 2345 2347 2408 2428 2239 2484 2182 2415
      4 2334 1904 2263 2099 2402 2190 2385 2464 2320 2280 2253 2425
      5 2392 2036 2443 2388 2340 2566 2459 2591 2390 2692 2475 2512
      6 2240 1905 2265 2222 2199 2290 2341 2246 2359 2391 2318 2342
table(format(ts,"%u"), format(ts,"%H")) %>% 
  as.data.frame.matrix %>% 
  d3heatmap(F,F,col=colorRamp(c('seagreen','lightyellow','red')))

【2.2】 What is the month and year of the median date in our dataset?

median(ts)
[1] "2006-05-21 12:30:00 CST"

【2.3】 In which month did the fewest motor vehicle thefts occur?

sort(table(format(ts,"%m")))

   02    04    03    06    05    01    09    11    12    08    07    10 
13511 15280 15758 16002 16035 16047 16060 16063 16426 16572 16801 17086 

【2.4】 On which weekday did the most motor vehicle thefts occur?

format(ts,"%w") %>% table %>% sort
.
    0     2     6     4     1     3     5 
26316 26791 27118 27319 27397 27416 29284 

【2.5】 Which month has the largest number of motor vehicle thefts for which an arrest was made?

ts[D$Arrest] %>% format('%m') %>% table %>% sort
.
  05   06   02   09   04   11   03   07   08   10   12   01 
1187 1230 1238 1248 1252 1256 1298 1324 1329 1342 1397 1435 



Section-5 TOP5 7x24 案件數 & 破案率

top5
[1] "DRIVEWAY - RESIDENTIAL"         "GAS STATION"                   
[3] "ALLEY"                          "PARKING LOT/GARAGE(NON.RESID.)"
[5] "STREET"                        
ts5 = ts[D$Location %in% top5]
length(ts5) == nrow(TOP5)       # length of ts5 should equals nrow(TOP5)
[1] TRUE
weekday = format(ts5,"%u")
hour = format(ts5,"%H")
table(weekday, hour)
       hour
weekday   00   01   02   03   04   05   06   07   08   09   10   11   12   13   14   15
      1 1785  777  664  494  388  502  706 1029 1211 1109  863  655 1005  742  859  954
      2 1589  725  561  440  376  480  791 1040 1085 1071  856  707 1006  688  817  962
      3 1707  739  572  437  368  525  792 1070 1211 1108  857  686 1103  723  780  970
      4 1751  775  657  485  379  504  759 1040 1175 1171  819  643  961  671  743  952
      5 1764  875  696  523  436  557  771 1112 1163 1161  850  736 1087  774  851 1025
      6 1951 1201  931  788  611  479  515  603  782  949  852  721 1102  693  872  982
      7 1908 1185  972  790  574  434  436  443  557  788  819  738 1067  719  878  951
       hour
weekday   16   17   18   19   20   21   22   23
      1 1036 1158 1405 1405 1524 1709 1898 1410
      2  997 1166 1433 1387 1586 1708 1945 1381
      3  980 1180 1459 1408 1594 1647 1957 1425
      4 1024 1155 1392 1434 1560 1678 2020 1490
      5 1069 1211 1491 1543 1622 1769 2170 1822
      6  966  997 1254 1285 1486 1615 1967 1653
      7  994 1069 1302 1264 1609 1613 1948 1498
tapply(TOP5$Arrest, list(weekday, hour), mean)
       00      01      02      03      04      05      06      07      08      09      10
1 0.06218 0.06178 0.07380 0.07490 0.04897 0.07371 0.06657 0.08649 0.07927 0.06673 0.06837
2 0.06167 0.07862 0.07308 0.06591 0.06915 0.05417 0.07585 0.07019 0.07189 0.06162 0.07477
3 0.06327 0.07984 0.07343 0.07551 0.07609 0.08190 0.10985 0.08879 0.08010 0.08484 0.07118
4 0.06625 0.05290 0.05632 0.06186 0.08179 0.07540 0.06719 0.07500 0.07404 0.08711 0.09035
5 0.06009 0.08229 0.06322 0.08031 0.08028 0.08438 0.07393 0.09173 0.08340 0.06718 0.07765
6 0.07125 0.06744 0.07519 0.07614 0.04910 0.06681 0.09320 0.07794 0.08312 0.07798 0.07981
7 0.07757 0.07342 0.08539 0.07089 0.06794 0.09217 0.10321 0.10384 0.11849 0.10406 0.10501
       11      12      13      14      15      16      17      18      19      20      21
1 0.07176 0.08060 0.10916 0.09895 0.10063 0.08687 0.09585 0.08754 0.09110 0.07480 0.06788
2 0.09194 0.06859 0.10320 0.08690 0.09252 0.08425 0.07290 0.08165 0.08796 0.07503 0.07553
3 0.09621 0.08613 0.10927 0.07692 0.09485 0.09490 0.08136 0.07745 0.08239 0.07340 0.06375
4 0.06532 0.07284 0.08197 0.08075 0.09769 0.09375 0.08745 0.07543 0.07741 0.06538 0.07509
5 0.09375 0.07544 0.07494 0.09518 0.08585 0.09542 0.09414 0.09188 0.08231 0.07522 0.08027
6 0.08460 0.09437 0.09091 0.08716 0.09063 0.10352 0.08225 0.09410 0.09650 0.08008 0.06811
7 0.09214 0.09841 0.09318 0.09112 0.09884 0.09054 0.10009 0.08909 0.08070 0.08763 0.08431
       22      23
1 0.05954 0.07305
2 0.07301 0.07169
3 0.06132 0.06667
4 0.06139 0.06309
5 0.06866 0.07245
6 0.07626 0.07985
7 0.07546 0.06943
L = lapply(top5, function(x) {
  i = TOP5$LocationDescription == x
  tapply(TOP5$Arrest[i], list(weekday[i], hour[i]), mean)
  })

TOP5 7x24: 竊盜案件數量 vs. 破案率

pacman::p_load(manipulateWidget)  # install/load library
color = colorRamp(c('seagreen','lightyellow','red'))  # heatmap color
# put 2 heatmaps in one list object
L = list(as.data.frame.matrix( table(weekday, hour) ),
     tapply(TOP5$Arrest, list(weekday, hour), mean)) %>% 
  lapply(d3heatmap, F, F, col=color, show_grid=F, xaxis_height=8, yaxis_width=8)

# plot the 2 heatmaps side by side
combineWidgets(list=L, ncol=2) 
# make 2 heatmaps for each top5 location
L = lapply(top5, function(loc) {           
  weekday = format(ts5[TOP5$LocationDescription == loc],"%u")
  hour = format(ts5[TOP5$LocationDescription == loc], "%H")
  arrest = TOP5$Arrest[TOP5$LocationDescription == loc]
  list(count = as.data.frame.matrix( table(weekday, hour) ),
       arrest = tapply(arrest, list(weekday, hour), mean)) %>% 
    lapply(d3heatmap, F, F, col=color, show_grid=F, xaxis_height=8, yaxis_width=8)
  })
# plot 10 heatmaps at once
combineWidgets(
  list=do.call(c, L), ncol=2,
  title = "<h4>TOP5 7x24 : 竊盜案件數量 & 破案率</h4>",
  footer= "<p>由上至下:(1)住宅車道、(2)加油站、(3)巷弄、(4)公用停車場、(5)街道</p>"
  )