Using hourly weather data from 2012-08-15 to 2015-08-14 to illustrate features of data.table.
Special symbols in this data:
- T: trace amount of precipitation
- x: instrument error
- V: wind direction is uncertain
Using hourly weather data from 2012-08-15 to 2015-08-14 to illustrate features of data.table.
Special symbols in this data:
Here we use data.table v1.9.5
devtools::install_github("Rdatatable/data.table")
library(data.table)
DT[i, j, by]
| R: | i | j | by |
|---|---|---|---|
| SQL: | where | select/update | group by |
Take DT, subset rows using i, then calculate j, grouped by by.
filepaths = list.files("data/daily", full.names = TRUE)
locations = gsub("data\\/daily\\/|_....-..-..\\.csv", "", filepaths)
dates = gsub("data\\/daily\\/.*_|\\.csv", "", filepaths)
weather = list()
for (i in 1:length(filepaths)) {
weather[[i]] = fread(filepaths[i])
weather[[i]][, city := locations[i]]
weather[[i]][, date := dates[i]]
}
weather = rbindlist(weather)
You can use data.table with any function that suitable for data.frame.
class(weather)
## [1] "data.table" "data.frame"
get_dt = function() {
dt = list()
for (i in 1:length(filepaths)) {
dt[[i]] = fread(filepaths[i])
dt[[i]][, city := locations[i]]
dt[[i]][, date := dates[i]]
}
rbindlist(dt)
}
get_df = function() {
df = list()
for (i in 1:length(filepaths)) {
df[[i]] = read.csv(filepaths[i])
df[[i]]$city = locations[i]
df[[i]]$date = dates[i]
}
do.call(rbind, df)
}Here we use rbenchmark to test above two functions
library(rbenchmark)
within(benchmark(get_dt(), get_df(), replications = 3,
columns=c('test', 'replications', 'elapsed', "relative")),
{ average = elapsed/replications })
## test replications elapsed relative average ## 2 get_df() 3 159.676 2.62 53.22533 ## 1 get_dt() 3 60.956 1.00 20.31867
Here we use lubridate::ymd_h to generate datetime. The warning message is due to the missing data of Kaohsiung in 2015-06-24
weather[,datetime := lubridate::ymd_h(paste(date, ObsTime), tz = "CST")]
## Warning: 2 failed to parse.
weather[is.na(datetime)]
## ObsTime StnPres SeaPres ## 1: NA NA NA ## 2: 觀測時間(LST)ObsTime 測站氣壓(hPa)StnPres 海平面氣壓(hPa)SeaPres ## Temperature Tddewpoint RH WS ## 1: NA NA NA NA ## 2: 氣溫(℃)Temperature 露點溫度(℃)Td dew point 相對溼度(%)RH 風速(m/s)WS ## WD WSGust ## 1: NA NA ## 2: 風向(最多風向)(360degree)WD 最大陣風(m/s)WSGust ## WDGust Precp PrecpHour ## 1: NA NA NA ## 2: 最大陣風風向(360degree)WDGust 降水量(mm)Precp 降水時數(hr)PrecpHour ## SunShine GloblRad Visb ## 1: NA NA NA ## 2: 日照時數(hr)SunShine 全天空日射量(MJ/㎡)GloblRad 能見度(km)Visb ## city date datetime ## 1: Kaohsiung 2015-06-24 <NA> ## 2: Kaohsiung 2015-06-24 <NA>
weather = weather[!is.na(datetime),] str(weather)
## Classes 'data.table' and 'data.frame': 105096 obs. of 18 variables: ## $ ObsTime : chr "1" "2" "3" "4" ... ## $ StnPres : chr "1002.9 " "1001.5 " "1002.6 " "1002.3 " ... ## $ SeaPres : chr "1005.1 " "1003.7 " "1004.8 " "1004.5 " ... ## $ Temperature: chr "28.6 " "28.3 " "28.3 " "28.1 " ... ## $ Tddewpoint : chr "21.1 " "22.1 " "21.6 " "21.7 " ... ## $ RH : chr "64 " "69 " "67 " "68 " ... ## $ WS : chr "1.3 " "3.2 " "2.5 " "1.1 " ... ## $ WD : chr "340.0 " "20.0 " "330.0 " "330.0 " ... ## $ WSGust : chr "2.7 " "5.8 " "4.6 " "4.2 " ... ## $ WDGust : chr "310.0 " "40.0 " "20.0 " "320.0 " ... ## $ Precp : chr "0.0 " "0.0 " "0.0 " "0.0 " ... ## $ PrecpHour : chr "0.0 " "0.0 " "0.0 " "0.0 " ... ## $ SunShine : chr " " " " " " " " ... ## $ GloblRad : chr "0.00 " "0.00 " "0.00 " "0.00 " ... ## $ Visb : chr " " "10.0 " " " " " ... ## $ city : chr "Hualien" "Hualien" "Hualien" "Hualien" ... ## $ date : chr "2012-08-15" "2012-08-15" "2012-08-15" "2012-08-15" ... ## $ datetime : POSIXct, format: "2012-08-15 01:00:00" "2012-08-15 02:00:00" ... ## - attr(*, ".internal.selfref")=<externalptr>
numeric_cols = colnames(weather)[!colnames(weather) %in% c("city", "date", "datetime")]
weather[, c(numeric_cols) := (lapply(.SD, function(x){gsub("[^0-9.TxV]", "", x)})),
.SDcols = numeric_cols]
You can update partial cells in a data.table.
# Check which numeric columns contain special symbols
weather[, lapply(.SD, function(x){sum(grepl("^[TxV]$", x), na.rm = FALSE)}),
.SDcol = numeric_cols]
## ObsTime StnPres SeaPres Temperature Tddewpoint RH WS WD WSGust WDGust ## 1: 0 0 0 0 0 0 0 209 0 0 ## Precp PrecpHour SunShine GloblRad Visb ## 1: 3884 0 0 0 0
weather[WD == "V", WD := ""] weather[Precp == "T", Precp := "0"]
weather[, `:=`(month = substr(date, 6, 7),
isRain = Precp > 0
)]
weather[, c(numeric_cols) := (lapply(.SD, as.numeric)), .SDcols = numeric_cols]
weather[, date := as.Date(date)]
# Select columns with list
ws = weather[, .(city, datetime, Temperature, Tddewpoint)]
# Select columns with character vector
attrs = c("city", "datetime", "Temperature", "Tddewpoint")
ws2 = weather[, attrs, with = FALSE]
identical(ws, ws2)
## [1] TRUE
hottest = ws[, .(datetime, Temperature, T.Max = max(Temperature)), keyby = city]
hottest = hottest[, count := .N]
hottest = hottest[Temperature == T.Max,
head(.SD[, .(T.Max, datetime)], 3), # output top 3 to check ties
by = city]
hottest
## city T.Max datetime ## 1: Hualien 35.1 2013-07-12 11:00:00 ## 2: Kaohsiung 36.9 2014-09-15 12:00:00 ## 3: Taichung 36.5 2014-09-15 14:00:00 ## 4: Taipei 38.3 2013-08-08 14:00:00
chaining: DT[ … ][ … ][ … ]
Avoid intermediate assignment.
hottest2 = ws[, .(datetime, Temperature, T.Max = max(Temperature)), keyby = city
][, count := .N
][Temperature == T.Max,
head(.SD[, .(T.Max, datetime)], 3), # output top 3 to check ties
by = city]
key(hottest2)
## [1] "city"
identical(hottest, hottest2)
## [1] TRUE
# cj = CJ(ws$city, ws$datetime, unique = TRUE) # don't use POSIXct in CJ
cj = CJ(ws$city, as.character(ws$datetime), unique = TRUE)[
, V2 := lubridate::ymd_hms(V2, tz = "CST")]
setnames(cj, names(cj), c("city", "datetime"))
ws = merge(cj, ws, all.x = TRUE, all.y = FALSE, by = c("city", "datetime"))
ws[, date := as.character(datetime)][,
`:=`(date = as.Date(substr(date, 1, 10)),
year = substr(date, 1, 4),
month = substr(date, 6, 7),
hour = substr(date, 12, 13))]
ws[city == "Kaohsiung" & date == as.Date("2015-06-24"), ]
## city datetime Temperature Tddewpoint date year ## 1: Kaohsiung 2015-06-24 00:00:00 29.4 24.7 2015-06-24 2015 ## 2: Kaohsiung 2015-06-24 01:00:00 NA NA 2015-06-24 2015 ## 3: Kaohsiung 2015-06-24 02:00:00 NA NA 2015-06-24 2015 ## 4: Kaohsiung 2015-06-24 03:00:00 NA NA 2015-06-24 2015 ## 5: Kaohsiung 2015-06-24 04:00:00 NA NA 2015-06-24 2015 ## 6: Kaohsiung 2015-06-24 05:00:00 NA NA 2015-06-24 2015 ## 7: Kaohsiung 2015-06-24 06:00:00 NA NA 2015-06-24 2015 ## 8: Kaohsiung 2015-06-24 07:00:00 NA NA 2015-06-24 2015 ## 9: Kaohsiung 2015-06-24 08:00:00 NA NA 2015-06-24 2015 ## 10: Kaohsiung 2015-06-24 09:00:00 NA NA 2015-06-24 2015 ## 11: Kaohsiung 2015-06-24 10:00:00 NA NA 2015-06-24 2015 ## 12: Kaohsiung 2015-06-24 11:00:00 NA NA 2015-06-24 2015 ## 13: Kaohsiung 2015-06-24 12:00:00 NA NA 2015-06-24 2015 ## 14: Kaohsiung 2015-06-24 13:00:00 NA NA 2015-06-24 2015 ## 15: Kaohsiung 2015-06-24 14:00:00 NA NA 2015-06-24 2015 ## 16: Kaohsiung 2015-06-24 15:00:00 NA NA 2015-06-24 2015 ## 17: Kaohsiung 2015-06-24 16:00:00 NA NA 2015-06-24 2015 ## 18: Kaohsiung 2015-06-24 17:00:00 NA NA 2015-06-24 2015 ## 19: Kaohsiung 2015-06-24 18:00:00 NA NA 2015-06-24 2015 ## 20: Kaohsiung 2015-06-24 19:00:00 NA NA 2015-06-24 2015 ## 21: Kaohsiung 2015-06-24 20:00:00 NA NA 2015-06-24 2015 ## 22: Kaohsiung 2015-06-24 21:00:00 NA NA 2015-06-24 2015 ## 23: Kaohsiung 2015-06-24 22:00:00 NA NA 2015-06-24 2015 ## 24: Kaohsiung 2015-06-24 23:00:00 NA NA 2015-06-24 2015 ## city datetime Temperature Tddewpoint date year ## month hour ## 1: 06 00 ## 2: 06 01 ## 3: 06 02 ## 4: 06 03 ## 5: 06 04 ## 6: 06 05 ## 7: 06 06 ## 8: 06 07 ## 9: 06 08 ## 10: 06 09 ## 11: 06 10 ## 12: 06 11 ## 13: 06 12 ## 14: 06 13 ## 15: 06 14 ## 16: 06 15 ## 17: 06 16 ## 18: 06 17 ## 19: 06 18 ## 20: 06 19 ## 21: 06 20 ## 22: 06 21 ## 23: 06 22 ## 24: 06 23 ## month hour
inter_mean = ws[year == "2015" & month == "06" & city == "Kaohsiung",
.(meanT = mean(Temperature, na.rm = TRUE),
meanTd = mean(Tddewpoint, na.rm = TRUE)),
by = hour]
setkey(inter_mean, hour)
Tj = which(colnames(ws) == "Temperature")
Tdj = which(colnames(ws) == "Tddewpoint")
for (i in which(is.na(ws$Temperature))) {
set(ws, i, Tj, inter_mean[ws[i, hour], meanT])
set(ws, i, Tdj, inter_mean[ws[i, hour], meanTd])
}
ws[city == "Kaohsiung" & date == as.Date("2015-06-24"), ]
## city datetime Temperature Tddewpoint date year ## 1: Kaohsiung 2015-06-24 00:00:00 29.40000 24.70000 2015-06-24 2015 ## 2: Kaohsiung 2015-06-24 01:00:00 29.40345 25.12069 2015-06-24 2015 ## 3: Kaohsiung 2015-06-24 02:00:00 29.24138 24.95172 2015-06-24 2015 ## 4: Kaohsiung 2015-06-24 03:00:00 28.99655 24.77586 2015-06-24 2015 ## 5: Kaohsiung 2015-06-24 04:00:00 28.80690 24.67586 2015-06-24 2015 ## 6: Kaohsiung 2015-06-24 05:00:00 28.57241 24.51379 2015-06-24 2015 ## 7: Kaohsiung 2015-06-24 06:00:00 28.65517 24.47586 2015-06-24 2015 ## 8: Kaohsiung 2015-06-24 07:00:00 29.74483 25.23103 2015-06-24 2015 ## 9: Kaohsiung 2015-06-24 08:00:00 30.87586 25.63448 2015-06-24 2015 ## 10: Kaohsiung 2015-06-24 09:00:00 31.35172 25.98276 2015-06-24 2015 ## 11: Kaohsiung 2015-06-24 10:00:00 31.82414 26.42069 2015-06-24 2015 ## 12: Kaohsiung 2015-06-24 11:00:00 32.22069 26.47586 2015-06-24 2015 ## 13: Kaohsiung 2015-06-24 12:00:00 32.45172 26.61034 2015-06-24 2015 ## 14: Kaohsiung 2015-06-24 13:00:00 32.60000 26.65172 2015-06-24 2015 ## 15: Kaohsiung 2015-06-24 14:00:00 32.62759 26.67241 2015-06-24 2015 ## 16: Kaohsiung 2015-06-24 15:00:00 32.14828 26.61379 2015-06-24 2015 ## 17: Kaohsiung 2015-06-24 16:00:00 32.03448 26.45862 2015-06-24 2015 ## 18: Kaohsiung 2015-06-24 17:00:00 31.53793 26.16897 2015-06-24 2015 ## 19: Kaohsiung 2015-06-24 18:00:00 30.85517 25.68276 2015-06-24 2015 ## 20: Kaohsiung 2015-06-24 19:00:00 30.33793 25.39310 2015-06-24 2015 ## 21: Kaohsiung 2015-06-24 20:00:00 30.11034 25.28276 2015-06-24 2015 ## 22: Kaohsiung 2015-06-24 21:00:00 29.98966 25.26207 2015-06-24 2015 ## 23: Kaohsiung 2015-06-24 22:00:00 29.79310 25.23448 2015-06-24 2015 ## 24: Kaohsiung 2015-06-24 23:00:00 29.74483 25.30345 2015-06-24 2015 ## city datetime Temperature Tddewpoint date year ## month hour ## 1: 06 00 ## 2: 06 01 ## 3: 06 02 ## 4: 06 03 ## 5: 06 04 ## 6: 06 05 ## 7: 06 06 ## 8: 06 07 ## 9: 06 08 ## 10: 06 09 ## 11: 06 10 ## 12: 06 11 ## 13: 06 12 ## 14: 06 13 ## 15: 06 14 ## 16: 06 15 ## 17: 06 16 ## 18: 06 17 ## 19: 06 18 ## 20: 06 19 ## 21: 06 20 ## 22: 06 21 ## 23: 06 22 ## 24: 06 23 ## month hour
In this example, all values are concatenate into single column.
ws[, .(coef(lm(Tddewpoint ~ Temperature))), by = city]
## city V1 ## 1: Hualien -2.7831813 ## 2: Hualien 0.9292042 ## 3: Kaohsiung -5.8939668 ## 4: Kaohsiung 1.0290631 ## 5: Taichung -1.2627112 ## 6: Taichung 0.8169720 ## 7: Taipei -2.0428652 ## 8: Taipei 0.8500687
As long as j returns a list, each element of the list will become a column in the resulting data.table.
ws[, as.list(coef(lm(Tddewpoint ~ Temperature))), by = city]
## city (Intercept) Temperature ## 1: Hualien -2.783181 0.9292042 ## 2: Kaohsiung -5.893967 1.0290631 ## 3: Taichung -1.262711 0.8169720 ## 4: Taipei -2.042865 0.8500687
Sort Temperature in ascending and Tddewpoint in descending
ws[order(Temperature, -Tddewpoint)]
## city datetime Temperature Tddewpoint date ## 1: Taichung 2014-02-21 06:00:00 6.8 3.7 2014-02-21 ## 2: Taichung 2014-02-21 05:00:00 7.2 4.5 2014-02-21 ## 3: Taichung 2014-02-21 04:00:00 7.4 2.9 2014-02-21 ## 4: Taichung 2014-01-23 05:00:00 7.6 5.3 2014-01-23 ## 5: Taichung 2014-01-23 07:00:00 7.7 5.2 2014-01-23 ## --- ## 105116: Taipei 2014-08-02 12:00:00 37.4 24.3 2014-08-02 ## 105117: Taipei 2014-07-09 13:00:00 37.5 25.9 2014-07-09 ## 105118: Taipei 2013-08-08 15:00:00 37.7 25.0 2013-08-08 ## 105119: Taipei 2013-08-08 13:00:00 37.8 24.4 2013-08-08 ## 105120: Taipei 2013-08-08 14:00:00 38.3 25.4 2013-08-08 ## year month hour ## 1: 2014 02 06 ## 2: 2014 02 05 ## 3: 2014 02 04 ## 4: 2014 01 05 ## 5: 2014 01 07 ## --- ## 105116: 2014 08 12 ## 105117: 2014 07 13 ## 105118: 2013 08 15 ## 105119: 2013 08 13 ## 105120: 2013 08 14
order_dt = function(ws) {
ws[order(city, datetime)]
ws[order(Temperature, -Tddewpoint)]
}
order_base = function(ws) {
ws[base::order(city, datetime)]
ws[base::order(Temperature, -Tddewpoint)]
}Here we use rbenchmark to test above two functions
within(benchmark(order_dt_out <- order_dt(ws),
order_base_out <- order_base(ws),
replications = 10,
columns=c('test', 'replications', 'elapsed', "relative")),
{ average = elapsed/replications })
## test replications elapsed relative average ## 2 order_base_out <- order_base(ws) 10 1.912 4.585 0.1912 ## 1 order_dt_out <- order_dt(ws) 10 0.417 1.000 0.0417
identical(order_dt_out, order_base_out)
## [1] TRUE
:= and set* modify the data.table by references. You can use copy to do a deep copy to avoid side effect(modify a object within a function).
deepf = function(dt) {
dt = copy(dt)
dt[, city := substr(city, 1, 1)]
}
deepf(hottest2)
hottest2
## city T.Max datetime ## 1: Hualien 35.1 2013-07-12 11:00:00 ## 2: Kaohsiung 36.9 2014-09-15 12:00:00 ## 3: Taichung 36.5 2014-09-15 14:00:00 ## 4: Taipei 38.3 2013-08-08 14:00:00
shallowf = function(dt) {
dt[, city := substr(city, 1, 1)]
}
shallowf(hottest2)
hottest2
## city T.Max datetime ## 1: H 35.1 2013-07-12 11:00:00 ## 2: K 36.9 2014-09-15 12:00:00 ## 3: T 36.5 2014-09-15 14:00:00 ## 4: T 38.3 2013-08-08 14:00:00
data.table implement melt in reshape2. Additionally, you can use regular expression with patterns to melt multiple columns.
melted = melt.data.table(weather,
id.vars = c("city", "datetime"),
measure = patterns(".*Pres", "Precp"),
value.name = c("Pres", "Precp")
)
melted[c(1,2,.N-1,.N),]
## city datetime variable Pres Precp ## 1: Hualien 2012-08-15 01:00:00 1 1002.9 0 ## 2: Hualien 2012-08-15 02:00:00 1 1001.5 0 ## 3: Taipei 2015-08-14 23:00:00 2 1008.7 0 ## 4: Taipei 2015-08-15 00:00:00 2 1008.4 0
You can also use dcast with data.table. Furthermore, you can cast multiple value.vars simultaneously.
dcasted = dcast.data.table(melted, city + datetime ~ variable,
value.var = c("Pres", "Precp"))
dcasted[c(1,2,.N-1,.N),]
## city datetime Pres_1 Pres_2 Precp_1 Precp_2 ## 1: Hualien 2012-08-15 01:00:00 1002.9 1005.1 0 0 ## 2: Hualien 2012-08-15 02:00:00 1001.5 1003.7 0 0 ## 3: Taipei 2015-08-14 23:00:00 1005.2 1008.7 0 0 ## 4: Taipei 2015-08-15 00:00:00 1004.9 1008.4 0 0