I have looked around, but could not find another option that would run on my Apple Mac system. So I thought I would have a go at writing my own seasonal adjustment function (do it yourself seasonal adjustment). Before I get to my code, let's look at how it performed.
It appears to be much closer to the ABS benchmark. So on to the code. We will start with the Henderson moving average (which is used a number of times to smooth the series).
## henderson.R
## calculate a Henderson moving average
hma <- function(v, n)
{
hmaSymmetricWeights <- function(n)
{
# caluclate the vector of symmetric weights
# formula from ABS (2003), 'A Guide to Interpreting Time Series', page 41.
# returns a dictionary of symmetric Henderson weights indexed from 1 to n
# calculate the constant denominator
m <- as.integer((n-1)/2)
m1 <- (m+1)*(m+1)
m2 <- (m+2)*(m+2)
d <- as.double(8*(m+2)*(m2-1)*(4*m2-1)*(4*m2-9)*(4*m2-25))
# calculate the weights
w <- rep(NA, n) # 1:n
m3 <- (m+3)*(m+3)
for( j in 0:m ) {
j2 <- j*j
v <- (315*(m1-j2)*(m2-j2)*(m3-j2)*(3*m2-11*j2-16))/d
w[(m+1+j)] <- v
if (j > 0)
w[(m+1-j)] <- v
}
return(w)
}
hmaAsymmetricWeights <- function(n, mw, w)
{
# calculate the asymmetric end-weights
# formula from Mike Doherty (2001), 'The Surrogate Henderson Filters in X-11',
# Aust, NZ J of Stat. 43(4), 2001, pp901-999
# see formula (1) on page 903
# returns a dictionary of asymmetrical weights from 1 to mw;
# where mw is less than n, and
# w is the dictionary of symmetric henderson weights indexed from 1 to n
sumResidual <- sum(w[(mw+1):(n)])
sumEnd <- 0.0
for ( i in (mw+1):n )
sumEnd <- sumEnd + ((i)-((mw+1)/2.0)) * w[i]
ic <- 1.0
if (n >= 13 && n < 15)
ic <- 3.5
if (n >= 15)
ic <- 4.5
b2s2 <- (4.0/pi)/(ic*ic)
f1 <- as.double(sumResidual)/as.double((mw))
u <- 1:mw
for ( r in 1:mw ) {
calc1 <- (r- (mw+1)/2.0) * b2s2
calc2 <- 1 + ( mw*(mw-1)*(mw+1) /12.0 ) * b2s2
u[r] <- w[r] + f1 + ( calc1 / calc2 ) * sumEnd
}
return (u)
}
# --- the main body of the function
# test that n is an integer, odd and >= 5
n <- as.integer(n[1]) # integer; not vectorised
if(! is.integer(n) )
stop("In hma(v, n), n must be an integer")
if(n < 5)
stop("In hma(v, n), n must be >= 5")
if( (n %% 2) == 0)
stop("In hma(v, n), n must be odd")
# confirm that v is a vector
if ( !(is.vector(v) && is.atomic(v) && is.numeric(v)) )
stop("In hma(v, n), v must be an atomic, numeric vector")
# handle NA - need to think about this more
if( any(is.na(v)) )
stop("In hma(v, n), the vector, v, must not containe NA values")
# calculate the symmetric weights
weights <- hmaSymmetricWeights(n)
# construct the return series
l <- length(v)
r <- rep(as.numeric(NA), l) # r will be the vector we return
if( l < n ) return( r ) # handle short vectors
m <- as.integer((n-1)/2)
for( i in 1:l ) {
if( i <= m ) {
# asymmetric weights at the front end
w <- rev(hmaAsymmetricWeights(n, i+m, weights))
r[i] <- sum(v[1:(i+m)]*w)
}
# apply the symmetric weights to the middle of v
if( i > m && i <= l-m )
r[i] <- sum(v[(i-m):(i+m)]*weights)
if ( i > l-m ) {
# asymmetric weights at the back end
sz <- l - i + 1 + m
w <- hmaAsymmetricWeights(n, sz, weights)
r[i] <- sum(v[(i-m):l]*w)
}
}
return (r)
}
Next we will look at the functions I use for seasonal adjustment. It assumes a multiplicative model; and that there are no missing data items or series breaks. The eleven steps in the decomposition process are set out in the "main function" towards the bottom of the next code block.
# A hugely cut-down algorithm drawing on ideas from the X-11 ARIMA # seasonal adjustment package and the Australian Bureau of Statistics # Assume we have a time series where: Original(t) = Trend(t) * Seasonal(t) * Irregular(t) source("../bin/henderson.R") # for henderson moving average library(zoo) library(tseries) library(forecast) extend.series <- function(series.ts, extend.by=2) { # extend the series using an SARIMA fit and prediction at each end # to backcast and forecast the likely series trajectories # (We extend by two periods to avoid HMA endpoint issues) # initialise log.series.ts <- log(series.ts) start.series.c <- start(series.ts) # forward prediction forward.fit <- NULL tryCatch(forward.fit <- auto.arima(log.series.ts, ic='bic'), error=function(e) { print(e) } ) if(class(forward.fit) != 'Arima') return(NULL) prediction.log <- NULL tryCatch(prediction.log <- predict( forward.fit, n.ahead=(frequency(series.ts)*extend.by) ), error=function(e) { print(e) } ) if(class(prediction.log) != 'list') return(NULL) # reverse prediction backwards.fit <- NULL reverse.log.series.ts <- ts( data=rev(as.vector(log.series.ts)), frequency=frequency(series.ts) ) tryCatch(backwards.fit <- auto.arima(reverse.log.series.ts, ic='bic'), error=function(e) { print(e) } ) if(class(backwards.fit) != 'Arima') return(NULL) prediction.log.reverse <- NULL tryCatch(prediction.log.reverse <- predict( backwards.fit, n.ahead=(frequency(series.ts)*extend.by) ), error=function(e) { print(e) } ) if(class(prediction.log.reverse) != 'list') return(NULL) # construct the extended series extended.vector <- rev(exp(as.vector(prediction.log.reverse$pred))) extended.vector <- append(extended.vector, as.vector(series.ts) ) extended.vector <- append( extended.vector, exp(as.vector(prediction.log$pred)) ) start.series.c[1] <- start.series.c[1] - extend.by extended.ts <- ts(data=extended.vector, frequency=frequency(series.ts), start=start.series.c) # package up everything we know and return it in a named list ... return(list(extended=extended.ts, forward=forward.fit, backwards=backwards.fit)) } adjust.seasonal.components <- function(raw.components.ts) { # multipass filter to ensure the mean for any period is 1 loop.seasonal <- function(components.v, freq, len, beginning) { for(start in seq(beginning,len,freq)) { if(start + freq - 1 > len) break end <- start + freq - 1 multiplier <- freq / sum(components.v[start:end]) components.v[start:end] <- components.v[start:end] * multiplier } return(components.v) } components.v <- as.vector(raw.components.ts) freq <- frequency(raw.components.ts) len <- length(components.v) # standardise full years from month one components.v <- loop.seasonal(components.v, freq, len, 1) # standardise full years from month seven components.v <- loop.seasonal(components.v, freq, len, 1+trunc(freq/2)) # standardise the last full year components.v <- loop.seasonal(components.v, freq, len, len-freq+1) return( ts(components.v, start=start(raw.components.ts), frequency=frequency(raw.components.ts)) ) } smooth.seasonal.components <- function(raw.seasonal.ts) { median.filter <- function(vec, span) { n <- length(vec) halfspan <- trunc(span/2) ret.vec <- 1:n for(i in 1:n) { a <- max(1, i-halfspan) b <- min(n, i+halfspan) selection <- na.trim(vec[a:b]) if(length(selection)==0) ret.vec[i] <- NA else ret.vec[i] <- median(selection) } return(ret.vec) } k3x5 <- c(1/15, 2/15, 3/15, 3/15, 3/15, 2/15, 1/15) # from SEASABS # convert from ts to zooreg seasonal.zr <- zooreg(as.vector(raw.seasonal.ts), start=start(raw.seasonal.ts), frequency=frequency(raw.seasonal.ts)) for(period in 1:frequency(seasonal.zr)) { # get the column of data for period cases <- as.vector(seasonal.zr[cycle(seasonal.zr)==period,,drop=FALSE]) # smooth the column # first - take the median over n periods to remove worst case outliers, # but this will lead to a more spikey seasonally adjusted series, # and possibly more residual seasonality in the irregular component # so, we only do it with longer series ... if(length(raw.seasonal.ts) > (4 * frequency(raw.seasonal.ts))) cases <- median.filter(vec=cases, span=3) # second - use the filter above to further smooth cases.smoothed <- filter(cases, filter=k3x5, sides=2) # replicate the seasonal terms at the start and end of the series # - lost through the 7-term MA smoothing base = 4 if(is.na(cases.smoothed[base])) base = 5 for(i in 1:3) cases.smoothed[base-i] <- cases.smoothed[base] # base-1, base-2, base-3 n <- length(cases.smoothed) if(is.na(cases.smoothed[n-3])) n <- n - 1 for(i in 0:2) cases.smoothed[n-i] <- cases.smoothed[n-3] # n-0, n-1, n-2 # and put it back again seasonal.zr[cycle(seasonal.zr)==period] <- cases.smoothed } # convert back from zooreg to ts smooth.seasonal.ts <- as.ts(seasonal.zr) # re-adjust the data so that the 12 month seasonals sum to 12. smooth.seasonal.ts <- adjust.seasonal.components(smooth.seasonal.ts) return(smooth.seasonal.ts) } simple.trend.estimate <- function(series.ts) { # a period moving average to estimate the trend. # pick the right filter f <- NULL freq = frequency(series.ts) if(freq == 2) f <- c(1/4, 1/2, 1/4) if(freq == 4) f <- c(1/8, 1/4, 1/4, 1/4, 1/8) if(freq == 12) f <- c(1/24, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/24) # and move it over the series return( filter(series.ts, filter=f, sides=2) ) } validate <- function(series.ts, henderson) { # confirm input is a ts object if(!is.ts(series.ts)) stop('Series.ts should be a ts time series object') # remove the leading and trailing NA observations series.ts <- na.trim(series.ts) # confirm the frequency is greater than 1 freq <- frequency(series.ts) if(freq != 2 && freq != 4 && freq != 12) { print(freq) stop('Series.ts has an unsupported frequency') } # confirm the henderson term primes = c(5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109) if(!is.element(henderson, primes)) stop('Henderson moving average should be prime between 5 and 109') # confirm we have sufficient data for a time series analysis if(length(series.ts) < 4*freq) stop('Series.ts not sufficiently long') return(series.ts) } # main function ... get.seasonal <- function(original.ts, henderson=NULL, verbose=FALSE) { # set the smoothing term if(is.null(henderson)) { freq = frequency(original.ts) if(freq == 2) henderson <- 5 if(freq == 4) henderson <- 5 if(freq == 12) henderson <- 13 if(is.null(henderson)) henderson <- 29 # a number plucked from nowhere } henderson <- henderson[1] # not vectorised # validate the series - saves time debugging later series.ts <- validate(original.ts, henderson) # Step 1 -- first use SARIMA to extend the series a bit extended.l <- extend.series(original.ts) # Step 2 -- preliminary estimation of the trend using a moving average arima.is.ok <- !is.null(extended.l) if(arima.is.ok) { extended.ts <- extended.l$extended prelim.trend.ts <- simple.trend.estimate(extended.ts) # -- note extended.ts if(verbose) { cat('Forward ARIMA: '); print(extended.l$forward$arma) cat('Reverse ARIMA: '); print(extended.l$backwards$arma) } } else { # Because the ARIMA failed to find a good fit, # we will use a Henderson MA to estimate the trend # but this is probably an exercise in silliness. # let's smooth with a higher order henderson moving average # pick the next next highest prime after double the current value thisOrNextHighestPrime <- function(n) { n <- as.integer(n[1]) # integer and not vectorised if(!is.finite(n)) return(NA) if(n <= 2L) return(2L) if(n %% 2L == 0) n <- n + 1L # make odd isPrime <- function(n) all(n %% 2L:floor(sqrt(n)) != 0) # || n == 2L while(!isPrime(n)) n <- n + 2L # skip even return(n) } henderson <- thisOrNextHighestPrime(henderson * 2) # we will use straight up henderson moving average for initial trend extended.ts <- original.ts prelim.trend.ts <- ts( hma(as.vector(original.ts), henderson), start=start(original.ts), frequency=frequency(original.ts) ) if(verbose) print('No ARIMA model found') } # Step 3 -- first estimation of the seasonal.irregular component prelim.seasonal.ts <- extended.ts / prelim.trend.ts # -- note use of extended.ts # Step 4 -- let's smooth the seasonal . irregular component prelim.smooth.seasonal.ts <- smooth.seasonal.components(prelim.seasonal.ts) # Step 5 -- Preliminary estimate of the seasonally adjusted data # -- note use of extended.ts prelim.seasadj.ts <- na.trim(extended.ts / prelim.smooth.seasonal.ts) # Step 6 -- A better estimate of the trend better.trend.ts <- ts( hma(as.vector(prelim.seasadj.ts), henderson), start=start(prelim.seasadj.ts), frequency=frequency(prelim.seasadj.ts) ) # Step 7 -- A final estimate of the seasonal elements # -- note use of extended.ts better.seasonal.ts <- extended.ts / better.trend.ts final.seasonal.ts <- smooth.seasonal.components(better.seasonal.ts) final.seasonal.ts <- window(final.seasonal.ts, start=start(original.ts), end=end(original.ts)) # Step 8 -- final estimate of the seasonally adjusted series final.seasadj.ts <- original.ts / final.seasonal.ts # Step 9 -- final trend final.trend.ts <- ts( hma(as.vector(final.seasadj.ts), henderson), start=start(final.seasadj.ts), frequency=frequency(final.seasadj.ts) ) # step 10 -- final irregular final.irregular.ts <- final.seasadj.ts / final.trend.ts # Step 11 -- package up everything we know and # return it in a named list ... return(list(original=original.ts, trend=final.trend.ts, seasonal=final.seasonal.ts, irregular=final.irregular.ts, seasadj=final.seasadj.ts)) } seasonallyAdjust <- function(df, breaks, start, frequency, verbose=TRUE) { # Apply seasonal adjustment to selected columns in a data frame # returns an augmented data frame, such that: # -- seasonally adjusted in new column break.sa # -- trend in new column break.trend sa.suffix <- '.sa' trend.suffix <- '.trend' for(j in seq_along(breaks)) { if(verbose) { cat('seasonallyAdjust() Verbose: '); print(breaks[[j]]) } # - unique labels for the seasonal and trend components colName <- paste(breaks[[j]], sep='') colName.sa <- paste(colName, sa.suffix, sep='', collapse='') colName.trend <- paste(colName, trend.suffix, sep='', collapse='') # - seasonally decompose the series series <- df[, breaks[j]] series.ts <- ts(series, start=start, frequency=frequency) # --- code for seasonal adjustment using stl() #series.stl <- stl(series.ts, s.window='periodic', s.jump=1, t.jump=1, l.jump=1) #series.trend <- as.numeric(series.stl$time.series[, 'trend']) #series.sa <- series - as.numeric(series.stl$time.series[, 'seasonal']) series.season <- get.seasonal(original.ts=series.ts, verbose=verbose) series.trend <- as.numeric(series.season$trend) series.sa <- as.numeric(series.season$seasadj) # - store it and remember it df[, colName.sa] <- series.sa df[, colName.trend] <- series.trend } return(df) }
No comments:
Post a Comment