data.table

2015-10-02

Objective

Using hourly weather data from 2012-08-15 to 2015-08-14 to illustrate features of data.table.

Special symbols in this data:

T: trace amount of precipitation
x: instrument error
V: wind direction is uncertain

Installation

Here we use data.table v1.9.5

devtools::install_github("Rdatatable/data.table")
library(data.table)

What is data.table

An enhanced version of data.frames
Optimize most heavy works in C
Minimize redundant copies

Why data.table

Speed
Automatically optimization
Total solution for ETL
Concise syntax

Benchmarks

Benchmarks : Grouping

General Form

DT[i, j, by]

R:	i	j	by
SQL:	where	select/update	group by

Take DT, subset rows using i, then calculate j, grouped by by.

Get data

fread: read text file into data.table
:=: add/remove/update a column by reference.
rbindlist: Same as do.call("rbind", l) on data.frames, but faster.

filepaths = list.files("data/daily", full.names = TRUE)
locations = gsub("data\\/daily\\/|_....-..-..\\.csv", "", filepaths)
dates = gsub("data\\/daily\\/.*_|\\.csv", "", filepaths)
weather = list()
for (i in 1:length(filepaths)) {
    weather[[i]] = fread(filepaths[i])
    weather[[i]][, city := locations[i]]
    weather[[i]][, date := dates[i]]
}
weather = rbindlist(weather)

data.table is a data.frame, too

You can use data.table with any function that suitable for data.frame.

class(weather)

## [1] "data.table" "data.frame"

Benckmark: read data

get_dt = function() {
    dt = list()
    for (i in 1:length(filepaths)) {
        dt[[i]] = fread(filepaths[i])
        dt[[i]][, city := locations[i]]
        dt[[i]][, date := dates[i]]
    }
    rbindlist(dt)
}

get_df = function() {
    df = list()
    for (i in 1:length(filepaths)) {
        df[[i]] = read.csv(filepaths[i])
        df[[i]]$city = locations[i]
        df[[i]]$date = dates[i]
    }
    do.call(rbind, df)
}

Here we use rbenchmark to test above two functions

library(rbenchmark)
within(benchmark(get_dt(), get_df(), replications = 3, 
                 columns=c('test', 'replications', 'elapsed', "relative")),
       { average = elapsed/replications })

##       test replications elapsed relative  average
## 2 get_df()            3 159.676     2.62 53.22533
## 1 get_dt()            3  60.956     1.00 20.31867

Generate datetime

Here we use lubridate::ymd_h to generate datetime. The warning message is due to the missing data of Kaohsiung in 2015-06-24

weather[,datetime := lubridate::ymd_h(paste(date, ObsTime), tz = "CST")]

## Warning: 2 failed to parse.

weather[is.na(datetime)]

##                 ObsTime              StnPres                SeaPres
## 1:                   NA                   NA                     NA
## 2: 觀測時間(LST)ObsTime 測站氣壓(hPa)StnPres 海平面氣壓(hPa)SeaPres
##           Temperature              Tddewpoint            RH          WS
## 1:                 NA                      NA            NA          NA
## 2: 氣溫(℃)Temperature 露點溫度(℃)Td dew point 相對溼度(%)RH 風速(m/s)WS
##                             WD              WSGust
## 1:                          NA                  NA
## 2: 風向(最多風向)(360degree)WD 最大陣風(m/s)WSGust
##                           WDGust           Precp             PrecpHour
## 1:                            NA              NA                    NA
## 2: 最大陣風風向(360degree)WDGust 降水量(mm)Precp 降水時數(hr)PrecpHour
##                SunShine                    GloblRad           Visb
## 1:                   NA                          NA             NA
## 2: 日照時數(hr)SunShine 全天空日射量(MJ/㎡)GloblRad 能見度(km)Visb
##         city       date datetime
## 1: Kaohsiung 2015-06-24     <NA>
## 2: Kaohsiung 2015-06-24     <NA>

weather = weather[!is.na(datetime),]
str(weather)

## Classes 'data.table' and 'data.frame':   105096 obs. of  18 variables:
##  $ ObsTime    : chr  "1" "2" "3" "4" ...
##  $ StnPres    : chr  "1002.9 " "1001.5 " "1002.6 " "1002.3 " ...
##  $ SeaPres    : chr  "1005.1 " "1003.7 " "1004.8 " "1004.5 " ...
##  $ Temperature: chr  "28.6 " "28.3 " "28.3 " "28.1 " ...
##  $ Tddewpoint : chr  "21.1 " "22.1 " "21.6 " "21.7 " ...
##  $ RH         : chr  "64 " "69 " "67 " "68 " ...
##  $ WS         : chr  "1.3 " "3.2 " "2.5 " "1.1 " ...
##  $ WD         : chr  "340.0 " "20.0 " "330.0 " "330.0 " ...
##  $ WSGust     : chr  "2.7 " "5.8 " "4.6 " "4.2 " ...
##  $ WDGust     : chr  "310.0 " "40.0 " "20.0 " "320.0 " ...
##  $ Precp      : chr  "0.0 " "0.0 " "0.0 " "0.0 " ...
##  $ PrecpHour  : chr  "0.0 " "0.0 " "0.0 " "0.0 " ...
##  $ SunShine   : chr  " " " " " " " " ...
##  $ GloblRad   : chr  "0.00 " "0.00 " "0.00 " "0.00 " ...
##  $ Visb       : chr  " " "10.0 " " " " " ...
##  $ city       : chr  "Hualien" "Hualien" "Hualien" "Hualien" ...
##  $ date       : chr  "2012-08-15" "2012-08-15" "2012-08-15" "2012-08-15" ...
##  $ datetime   : POSIXct, format: "2012-08-15 01:00:00" "2012-08-15 02:00:00" ...
##  - attr(*, ".internal.selfref")=<externalptr>

Strip numeric columns

.SD: Subset of Data.table. It's a data table that holds the data for the current group defined using by.
.SDcols specifies the columns that returned in .SD.

numeric_cols = colnames(weather)[!colnames(weather) %in% c("city", "date", "datetime")]
weather[, c(numeric_cols) := (lapply(.SD, function(x){gsub("[^0-9.TxV]", "", x)})), 
        .SDcols = numeric_cols]

Deal with special symbols

You can update partial cells in a data.table.

# Check which numeric columns contain special symbols
weather[, lapply(.SD, function(x){sum(grepl("^[TxV]$", x), na.rm = FALSE)}), 
        .SDcol = numeric_cols]

##    ObsTime StnPres SeaPres Temperature Tddewpoint RH WS  WD WSGust WDGust
## 1:       0       0       0           0          0  0  0 209      0      0
##    Precp PrecpHour SunShine GloblRad Visb
## 1:  3884         0        0        0    0

weather[WD == "V", WD := ""]
weather[Precp == "T", Precp := "0"]

More data manipulation

:= also can modify multiple columns by reference in a functional way

weather[, `:=`(month = substr(date, 6, 7),
               isRain = Precp > 0
               )]
weather[, c(numeric_cols) := (lapply(.SD, as.numeric)), .SDcols = numeric_cols]
weather[, date := as.Date(date)]

Select columns

. is an abbreviation of list within data.table
Because j handle expressions, you need to set with = FALSE when slicing with character vector

# Select columns with list
ws = weather[, .(city, datetime, Temperature, Tddewpoint)]

# Select columns with character vector
attrs = c("city", "datetime", "Temperature", "Tddewpoint")
ws2 = weather[, attrs, with = FALSE]
identical(ws, ws2)

## [1] TRUE

Hottest date and Temperature in each city

You can use by to summarize by each group. keyby will add key automatically.
.N: returns the number of rows in the subset.

hottest = ws[, .(datetime, Temperature, T.Max = max(Temperature)), keyby = city]
hottest = hottest[, count := .N]
hottest = hottest[Temperature == T.Max, 
                  head(.SD[, .(T.Max, datetime)], 3), # output top 3 to check ties 
                  by = city]
hottest

##         city T.Max            datetime
## 1:   Hualien  35.1 2013-07-12 11:00:00
## 2: Kaohsiung  36.9 2014-09-15 12:00:00
## 3:  Taichung  36.5 2014-09-15 14:00:00
## 4:    Taipei  38.3 2013-08-08 14:00:00

Hottest date and Temperature in each city - 2

chaining: DT[ … ][ … ][ … ]

Avoid intermediate assignment.

hottest2 = ws[, .(datetime, Temperature, T.Max = max(Temperature)), keyby = city
              ][, count := .N
                ][Temperature == T.Max, 
                  head(.SD[, .(T.Max, datetime)], 3), # output top 3 to check ties 
                  by = city]
key(hottest2)

## [1] "city"

identical(hottest, hottest2)

## [1] TRUE

Get Complete dataset

CJ: Cross Join. Generate a data.table from the cross product of the vectors.
setnames: Set attributes of data.table by reference.

# cj = CJ(ws$city, ws$datetime, unique = TRUE)  # don't use POSIXct in CJ
cj = CJ(ws$city, as.character(ws$datetime), unique = TRUE)[
    , V2 := lubridate::ymd_hms(V2, tz = "CST")]
setnames(cj, names(cj), c("city", "datetime"))
ws = merge(cj, ws, all.x = TRUE, all.y = FALSE, by = c("city", "datetime"))
ws[, date := as.character(datetime)][,
    `:=`(date = as.Date(substr(date, 1, 10)),
         year = substr(date, 1, 4),
         month = substr(date, 6, 7),
         hour = substr(date, 12, 13))]

Get Complete dataset - 2

ws[city == "Kaohsiung" & date == as.Date("2015-06-24"), ]

##          city            datetime Temperature Tddewpoint       date year
##  1: Kaohsiung 2015-06-24 00:00:00        29.4       24.7 2015-06-24 2015
##  2: Kaohsiung 2015-06-24 01:00:00          NA         NA 2015-06-24 2015
##  3: Kaohsiung 2015-06-24 02:00:00          NA         NA 2015-06-24 2015
##  4: Kaohsiung 2015-06-24 03:00:00          NA         NA 2015-06-24 2015
##  5: Kaohsiung 2015-06-24 04:00:00          NA         NA 2015-06-24 2015
##  6: Kaohsiung 2015-06-24 05:00:00          NA         NA 2015-06-24 2015
##  7: Kaohsiung 2015-06-24 06:00:00          NA         NA 2015-06-24 2015
##  8: Kaohsiung 2015-06-24 07:00:00          NA         NA 2015-06-24 2015
##  9: Kaohsiung 2015-06-24 08:00:00          NA         NA 2015-06-24 2015
## 10: Kaohsiung 2015-06-24 09:00:00          NA         NA 2015-06-24 2015
## 11: Kaohsiung 2015-06-24 10:00:00          NA         NA 2015-06-24 2015
## 12: Kaohsiung 2015-06-24 11:00:00          NA         NA 2015-06-24 2015
## 13: Kaohsiung 2015-06-24 12:00:00          NA         NA 2015-06-24 2015
## 14: Kaohsiung 2015-06-24 13:00:00          NA         NA 2015-06-24 2015
## 15: Kaohsiung 2015-06-24 14:00:00          NA         NA 2015-06-24 2015
## 16: Kaohsiung 2015-06-24 15:00:00          NA         NA 2015-06-24 2015
## 17: Kaohsiung 2015-06-24 16:00:00          NA         NA 2015-06-24 2015
## 18: Kaohsiung 2015-06-24 17:00:00          NA         NA 2015-06-24 2015
## 19: Kaohsiung 2015-06-24 18:00:00          NA         NA 2015-06-24 2015
## 20: Kaohsiung 2015-06-24 19:00:00          NA         NA 2015-06-24 2015
## 21: Kaohsiung 2015-06-24 20:00:00          NA         NA 2015-06-24 2015
## 22: Kaohsiung 2015-06-24 21:00:00          NA         NA 2015-06-24 2015
## 23: Kaohsiung 2015-06-24 22:00:00          NA         NA 2015-06-24 2015
## 24: Kaohsiung 2015-06-24 23:00:00          NA         NA 2015-06-24 2015
##          city            datetime Temperature Tddewpoint       date year
##     month hour
##  1:    06   00
##  2:    06   01
##  3:    06   02
##  4:    06   03
##  5:    06   04
##  6:    06   05
##  7:    06   06
##  8:    06   07
##  9:    06   08
## 10:    06   09
## 11:    06   10
## 12:    06   11
## 13:    06   12
## 14:    06   13
## 15:    06   14
## 16:    06   15
## 17:    06   16
## 18:    06   17
## 19:    06   18
## 20:    06   19
## 21:    06   20
## 22:    06   21
## 23:    06   22
## 24:    06   23
##     month hour

Interpolate NA with mean

We can subset row by a particular columns after assiggn it as a key with setkey.
set is used to repeatedly update rows and columns by reference.

inter_mean = ws[year == "2015" & month == "06" & city == "Kaohsiung",
                .(meanT = mean(Temperature, na.rm = TRUE),
                  meanTd = mean(Tddewpoint, na.rm = TRUE)),
                by = hour]
setkey(inter_mean, hour)
Tj = which(colnames(ws) == "Temperature")
Tdj = which(colnames(ws) == "Tddewpoint")
for (i in which(is.na(ws$Temperature))) {
    set(ws, i, Tj, inter_mean[ws[i, hour], meanT])
    set(ws, i, Tdj, inter_mean[ws[i, hour], meanTd])
}

Interpolate NA with mean -2

ws[city == "Kaohsiung" & date == as.Date("2015-06-24"), ]

##          city            datetime Temperature Tddewpoint       date year
##  1: Kaohsiung 2015-06-24 00:00:00    29.40000   24.70000 2015-06-24 2015
##  2: Kaohsiung 2015-06-24 01:00:00    29.40345   25.12069 2015-06-24 2015
##  3: Kaohsiung 2015-06-24 02:00:00    29.24138   24.95172 2015-06-24 2015
##  4: Kaohsiung 2015-06-24 03:00:00    28.99655   24.77586 2015-06-24 2015
##  5: Kaohsiung 2015-06-24 04:00:00    28.80690   24.67586 2015-06-24 2015
##  6: Kaohsiung 2015-06-24 05:00:00    28.57241   24.51379 2015-06-24 2015
##  7: Kaohsiung 2015-06-24 06:00:00    28.65517   24.47586 2015-06-24 2015
##  8: Kaohsiung 2015-06-24 07:00:00    29.74483   25.23103 2015-06-24 2015
##  9: Kaohsiung 2015-06-24 08:00:00    30.87586   25.63448 2015-06-24 2015
## 10: Kaohsiung 2015-06-24 09:00:00    31.35172   25.98276 2015-06-24 2015
## 11: Kaohsiung 2015-06-24 10:00:00    31.82414   26.42069 2015-06-24 2015
## 12: Kaohsiung 2015-06-24 11:00:00    32.22069   26.47586 2015-06-24 2015
## 13: Kaohsiung 2015-06-24 12:00:00    32.45172   26.61034 2015-06-24 2015
## 14: Kaohsiung 2015-06-24 13:00:00    32.60000   26.65172 2015-06-24 2015
## 15: Kaohsiung 2015-06-24 14:00:00    32.62759   26.67241 2015-06-24 2015
## 16: Kaohsiung 2015-06-24 15:00:00    32.14828   26.61379 2015-06-24 2015
## 17: Kaohsiung 2015-06-24 16:00:00    32.03448   26.45862 2015-06-24 2015
## 18: Kaohsiung 2015-06-24 17:00:00    31.53793   26.16897 2015-06-24 2015
## 19: Kaohsiung 2015-06-24 18:00:00    30.85517   25.68276 2015-06-24 2015
## 20: Kaohsiung 2015-06-24 19:00:00    30.33793   25.39310 2015-06-24 2015
## 21: Kaohsiung 2015-06-24 20:00:00    30.11034   25.28276 2015-06-24 2015
## 22: Kaohsiung 2015-06-24 21:00:00    29.98966   25.26207 2015-06-24 2015
## 23: Kaohsiung 2015-06-24 22:00:00    29.79310   25.23448 2015-06-24 2015
## 24: Kaohsiung 2015-06-24 23:00:00    29.74483   25.30345 2015-06-24 2015
##          city            datetime Temperature Tddewpoint       date year
##     month hour
##  1:    06   00
##  2:    06   01
##  3:    06   02
##  4:    06   03
##  5:    06   04
##  6:    06   05
##  7:    06   06
##  8:    06   07
##  9:    06   08
## 10:    06   09
## 11:    06   10
## 12:    06   11
## 13:    06   12
## 14:    06   13
## 15:    06   14
## 16:    06   15
## 17:    06   16
## 18:    06   17
## 19:    06   18
## 20:    06   19
## 21:    06   20
## 22:    06   21
## 23:    06   22
## 24:    06   23
##     month hour

Compute regression coefficient in each city

In this example, all values are concatenate into single column.

ws[, .(coef(lm(Tddewpoint ~ Temperature))), by = city]

##         city         V1
## 1:   Hualien -2.7831813
## 2:   Hualien  0.9292042
## 3: Kaohsiung -5.8939668
## 4: Kaohsiung  1.0290631
## 5:  Taichung -1.2627112
## 6:  Taichung  0.8169720
## 7:    Taipei -2.0428652
## 8:    Taipei  0.8500687

Compute regression coefficient in each city - 2

As long as j returns a list, each element of the list will become a column in the resulting data.table.

ws[, as.list(coef(lm(Tddewpoint ~ Temperature))), by = city]

##         city (Intercept) Temperature
## 1:   Hualien   -2.783181   0.9292042
## 2: Kaohsiung   -5.893967   1.0290631
## 3:  Taichung   -1.262711   0.8169720
## 4:    Taipei   -2.042865   0.8500687

Sort

Sort Temperature in ascending and Tddewpoint in descending

order in data.table acts just like base::order, but faster.
- means decending.

ws[order(Temperature, -Tddewpoint)]

##             city            datetime Temperature Tddewpoint       date
##      1: Taichung 2014-02-21 06:00:00         6.8        3.7 2014-02-21
##      2: Taichung 2014-02-21 05:00:00         7.2        4.5 2014-02-21
##      3: Taichung 2014-02-21 04:00:00         7.4        2.9 2014-02-21
##      4: Taichung 2014-01-23 05:00:00         7.6        5.3 2014-01-23
##      5: Taichung 2014-01-23 07:00:00         7.7        5.2 2014-01-23
##     ---                                                               
## 105116:   Taipei 2014-08-02 12:00:00        37.4       24.3 2014-08-02
## 105117:   Taipei 2014-07-09 13:00:00        37.5       25.9 2014-07-09
## 105118:   Taipei 2013-08-08 15:00:00        37.7       25.0 2013-08-08
## 105119:   Taipei 2013-08-08 13:00:00        37.8       24.4 2013-08-08
## 105120:   Taipei 2013-08-08 14:00:00        38.3       25.4 2013-08-08
##         year month hour
##      1: 2014    02   06
##      2: 2014    02   05
##      3: 2014    02   04
##      4: 2014    01   05
##      5: 2014    01   07
##     ---                
## 105116: 2014    08   12
## 105117: 2014    07   13
## 105118: 2013    08   15
## 105119: 2013    08   13
## 105120: 2013    08   14

Benckmark: order

order_dt = function(ws) {
    ws[order(city, datetime)]
    ws[order(Temperature, -Tddewpoint)]
}

order_base = function(ws) {
    ws[base::order(city, datetime)]
    ws[base::order(Temperature, -Tddewpoint)]
}