## Saturday, November 30

### Home Brew Seasonal Adjustment

I have been using the R package stl() to undertake seasonal decompositions of time series data. However, it removes much more signal/noise than the approach used by by the Australian Bureau of Statistics. Because it produces a trend series that is much less noisy than that from the ABS, it is slower to identify turning points in the data (as can be seen in the next three charts).

I have looked around, but could not find another option that would run on my Apple Mac system. So I thought I would have a go at writing my own seasonal adjustment function (do it yourself seasonal adjustment). Before I get to my code, let's look at how it performed.

It appears to be much closer to the ABS benchmark. So on to the code. We will start with the Henderson moving average (which is used a number of times to smooth the series).

``````## henderson.R
## calculate a Henderson moving average

hma <- function(v, n)
{
hmaSymmetricWeights <- function(n)
{
# caluclate the vector of symmetric weights
# formula from ABS (2003), 'A Guide to Interpreting Time Series', page 41.
# returns a dictionary of symmetric Henderson weights indexed from 1 to n

# calculate the constant denominator
m <- as.integer((n-1)/2)
m1 <- (m+1)*(m+1)
m2 <- (m+2)*(m+2)
d <- as.double(8*(m+2)*(m2-1)*(4*m2-1)*(4*m2-9)*(4*m2-25))

# calculate the weights
w <- rep(NA, n) # 1:n
m3 <- (m+3)*(m+3)
for( j in 0:m ) {
j2 <- j*j
v <- (315*(m1-j2)*(m2-j2)*(m3-j2)*(3*m2-11*j2-16))/d
w[(m+1+j)] <- v
if (j > 0)
w[(m+1-j)] <- v
}
return(w)
}

hmaAsymmetricWeights <- function(n, mw, w)
{
# calculate the asymmetric end-weights
# formula from Mike Doherty (2001), 'The Surrogate Henderson Filters in X-11',
# Aust, NZ J of Stat. 43(4), 2001, pp901-999
# see formula (1) on page 903

# returns a dictionary of asymmetrical weights from 1 to mw;
# where mw is less than n, and
# w is the dictionary of symmetric henderson weights indexed from 1 to n

sumResidual <- sum(w[(mw+1):(n)])
sumEnd <- 0.0
for ( i in (mw+1):n )
sumEnd <- sumEnd + ((i)-((mw+1)/2.0)) * w[i]

ic <- 1.0
if (n >= 13 && n < 15)
ic <- 3.5
if (n >= 15)
ic <- 4.5

b2s2 <- (4.0/pi)/(ic*ic)
f1 <- as.double(sumResidual)/as.double((mw))

u <- 1:mw
for ( r in 1:mw ) {
calc1 <- (r- (mw+1)/2.0) * b2s2
calc2 <- 1 + ( mw*(mw-1)*(mw+1) /12.0 ) * b2s2
u[r] <- w[r] + f1 + ( calc1 / calc2 ) * sumEnd
}
return (u)
}

# --- the main body of the function

# test that n is an integer, odd and >= 5
n <- as.integer(n) # integer; not vectorised
if(! is.integer(n) )
stop("In hma(v, n), n must be an integer")
if(n < 5)
stop("In hma(v, n), n must be >= 5")
if( (n %% 2) == 0)
stop("In hma(v, n), n must be odd")

# confirm that v is a vector
if ( !(is.vector(v) && is.atomic(v) && is.numeric(v)) )
stop("In hma(v, n), v must be an atomic, numeric vector")

# handle NA - need to think about this more
if( any(is.na(v)) )
stop("In hma(v, n), the vector, v, must not containe NA values")

# calculate the symmetric weights
weights <- hmaSymmetricWeights(n)

# construct the return series
l <- length(v)
r <- rep(as.numeric(NA), l)    # r will be the vector we return
if( l < n ) return( r )        # handle short vectors
m <- as.integer((n-1)/2)
for( i in 1:l ) {
if( i <= m ) {
# asymmetric weights at the front end
w <- rev(hmaAsymmetricWeights(n, i+m, weights))
r[i] <- sum(v[1:(i+m)]*w)
}

# apply the symmetric weights to the middle of v
if( i > m && i <= l-m )
r[i] <- sum(v[(i-m):(i+m)]*weights)

if ( i > l-m ) {
# asymmetric weights at the back end
sz <- l - i + 1 + m
w <- hmaAsymmetricWeights(n, sz, weights)
r[i] <- sum(v[(i-m):l]*w)
}
}

return (r)
}``````

Next we will look at the functions I use for seasonal adjustment. It assumes a multiplicative model; and that there are no missing data items or series breaks. The eleven steps in the decomposition process are set out in the "main function" towards the bottom of the next code block.

```# A hugely cut-down algorithm drawing on ideas from the X-11 ARIMA
# seasonal adjustment package and the Australian Bureau of Statistics
# Assume we have a time series where: Original(t) = Trend(t) * Seasonal(t) * Irregular(t)

source("../bin/henderson.R") # for henderson moving average
library(zoo)
library(tseries)
library(forecast)

extend.series <- function(series.ts, extend.by=2)
{
# extend the series using an SARIMA fit and prediction at each end
# to backcast and forecast the likely series trajectories
# (We extend by two periods to avoid HMA endpoint issues)

# initialise
log.series.ts <- log(series.ts)
start.series.c <- start(series.ts)

# forward prediction
forward.fit <- NULL
tryCatch(forward.fit <- auto.arima(log.series.ts, ic='bic'),
error=function(e) {
print(e)
}
)
if(class(forward.fit) != 'Arima') return(NULL)
prediction.log <- NULL
tryCatch(prediction.log <- predict( forward.fit,
n.ahead=(frequency(series.ts)*extend.by) ),
error=function(e) {
print(e)
}
)
if(class(prediction.log) != 'list') return(NULL)

# reverse prediction
backwards.fit <- NULL
reverse.log.series.ts <- ts( data=rev(as.vector(log.series.ts)),
frequency=frequency(series.ts) )
tryCatch(backwards.fit <- auto.arima(reverse.log.series.ts, ic='bic'),
error=function(e) {
print(e)
}
)
if(class(backwards.fit) != 'Arima') return(NULL)
prediction.log.reverse <- NULL
tryCatch(prediction.log.reverse <- predict( backwards.fit,
n.ahead=(frequency(series.ts)*extend.by) ),
error=function(e) {
print(e)
}
)
if(class(prediction.log.reverse) != 'list') return(NULL)

# construct the extended series
extended.vector <- rev(exp(as.vector(prediction.log.reverse\$pred)))
extended.vector <- append(extended.vector, as.vector(series.ts) )
extended.vector <- append( extended.vector,
exp(as.vector(prediction.log\$pred)) )
start.series.c <- start.series.c - extend.by
extended.ts <- ts(data=extended.vector,
frequency=frequency(series.ts), start=start.series.c)

# package up everything we know and return it in a named list ...
return(list(extended=extended.ts, forward=forward.fit, backwards=backwards.fit))
}

adjust.seasonal.components <- function(raw.components.ts)
{
# multipass filter to ensure the mean for any period is 1

loop.seasonal <- function(components.v, freq, len, beginning)
{
for(start in seq(beginning,len,freq)) {
if(start + freq - 1 > len)
break
end <- start + freq - 1
multiplier <- freq / sum(components.v[start:end])
components.v[start:end] <- components.v[start:end] * multiplier
}
return(components.v)
}

components.v <- as.vector(raw.components.ts)
freq <- frequency(raw.components.ts)
len <- length(components.v)

# standardise full years from month one
components.v <- loop.seasonal(components.v, freq, len, 1)
# standardise full years from month seven
components.v <- loop.seasonal(components.v, freq, len, 1+trunc(freq/2))
# standardise the last full year
components.v <- loop.seasonal(components.v, freq, len, len-freq+1)

return( ts(components.v, start=start(raw.components.ts),
frequency=frequency(raw.components.ts)) )
}

smooth.seasonal.components <- function(raw.seasonal.ts)
{
median.filter <- function(vec, span)
{
n <- length(vec)
halfspan <- trunc(span/2)
ret.vec <- 1:n
for(i in 1:n)
{
a <- max(1, i-halfspan)
b <- min(n, i+halfspan)
selection <- na.trim(vec[a:b])
if(length(selection)==0)
ret.vec[i] <- NA
else
ret.vec[i] <- median(selection)
}
return(ret.vec)
}

k3x5 <- c(1/15, 2/15, 3/15, 3/15, 3/15, 2/15, 1/15) # from SEASABS

# convert from ts to zooreg
seasonal.zr <- zooreg(as.vector(raw.seasonal.ts),
start=start(raw.seasonal.ts),
frequency=frequency(raw.seasonal.ts))

for(period in 1:frequency(seasonal.zr)) {
# get the column of data for period
cases <- as.vector(seasonal.zr[cycle(seasonal.zr)==period,,drop=FALSE])

# smooth the column

# first - take the median over n periods to remove worst case outliers,
# but this will lead to a more spikey seasonally adjusted series,
# and possibly more residual seasonality in the irregular component
# so, we only do it with longer series ...
if(length(raw.seasonal.ts) > (4 * frequency(raw.seasonal.ts)))
cases <- median.filter(vec=cases, span=3)

# second - use the filter above to further smooth
cases.smoothed <- filter(cases, filter=k3x5, sides=2)

# replicate the seasonal terms at the start and end of the series
# - lost through the 7-term MA smoothing
base = 4
if(is.na(cases.smoothed[base]))
base = 5
for(i in 1:3)
cases.smoothed[base-i] <- cases.smoothed[base]  # base-1, base-2, base-3
n <- length(cases.smoothed)
if(is.na(cases.smoothed[n-3]))
n <- n - 1
for(i in 0:2)
cases.smoothed[n-i] <- cases.smoothed[n-3]  # n-0, n-1, n-2

# and put it back again
seasonal.zr[cycle(seasonal.zr)==period] <- cases.smoothed
}

# convert back from zooreg to ts
smooth.seasonal.ts <- as.ts(seasonal.zr)

# re-adjust the data so that the 12 month seasonals sum to 12.
smooth.seasonal.ts <- adjust.seasonal.components(smooth.seasonal.ts)

return(smooth.seasonal.ts)
}

simple.trend.estimate <- function(series.ts)
{
# a period moving average to estimate the trend.

# pick the right filter
f <- NULL
freq = frequency(series.ts)
if(freq == 2) f <- c(1/4, 1/2, 1/4)
if(freq == 4) f <- c(1/8, 1/4, 1/4, 1/4, 1/8)
if(freq == 12) f <- c(1/24, 1/12, 1/12, 1/12, 1/12, 1/12,
1/12, 1/12, 1/12, 1/12, 1/12, 1/12, 1/24)

# and move it over the series
return( filter(series.ts, filter=f, sides=2) )
}

validate <- function(series.ts, henderson)
{
# confirm input is a ts object
if(!is.ts(series.ts))
stop('Series.ts should be a ts time series object')

# remove the leading and trailing NA observations
series.ts <- na.trim(series.ts)

# confirm the frequency is greater than 1
freq <- frequency(series.ts)
if(freq != 2 && freq != 4 && freq != 12) {
print(freq)
stop('Series.ts has an unsupported frequency')
}

# confirm the henderson term
primes = c(5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47,
53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109)
if(!is.element(henderson, primes))
stop('Henderson moving average should be prime between 5 and 109')

# confirm we have sufficient data for a time series analysis
if(length(series.ts) < 4*freq)
stop('Series.ts not sufficiently long')

return(series.ts)
}

# main function ...
get.seasonal <- function(original.ts, henderson=NULL, verbose=FALSE)
{
# set the smoothing term
if(is.null(henderson)) {
freq = frequency(original.ts)
if(freq == 2) henderson <- 5
if(freq == 4) henderson <- 5
if(freq == 12) henderson <- 13
if(is.null(henderson)) henderson <- 29 # a number plucked from nowhere
}
henderson <- henderson # not vectorised

# validate the series - saves time debugging later
series.ts <- validate(original.ts, henderson)

# Step 1 -- first use SARIMA to extend the series a bit
extended.l <- extend.series(original.ts)

# Step 2 -- preliminary estimation of the trend using a moving average
arima.is.ok <- !is.null(extended.l)
if(arima.is.ok) {
extended.ts <- extended.l\$extended
prelim.trend.ts <- simple.trend.estimate(extended.ts) # -- note extended.ts
if(verbose) {
cat('Forward ARIMA: '); print(extended.l\$forward\$arma)
cat('Reverse ARIMA: '); print(extended.l\$backwards\$arma)
}
}
else {
# Because the ARIMA failed to find a good fit,
# we will use a Henderson MA to estimate the trend
# but this is probably an exercise in silliness.

# let's smooth with a higher order henderson moving average
# pick the next next highest prime after double the current value

thisOrNextHighestPrime <- function(n) {
n <- as.integer(n) # integer and not vectorised
if(!is.finite(n)) return(NA)
if(n <= 2L) return(2L)

if(n %% 2L == 0) n <- n + 1L # make odd
isPrime <- function(n) all(n %% 2L:floor(sqrt(n)) != 0) # || n == 2L
while(!isPrime(n)) n <- n + 2L # skip even
return(n)
}

henderson <- thisOrNextHighestPrime(henderson * 2)

# we will use straight up henderson moving average for initial trend
extended.ts <- original.ts
prelim.trend.ts <- ts( hma(as.vector(original.ts), henderson),
start=start(original.ts),
frequency=frequency(original.ts) )
if(verbose) print('No ARIMA model found')
}

# Step 3 -- first estimation of the seasonal.irregular component
prelim.seasonal.ts <- extended.ts / prelim.trend.ts # -- note use of extended.ts

# Step 4 -- let's smooth the seasonal . irregular component
prelim.smooth.seasonal.ts <- smooth.seasonal.components(prelim.seasonal.ts)

# Step 5 -- Preliminary estimate of the seasonally adjusted data
# -- note use of extended.ts
prelim.seasadj.ts <- na.trim(extended.ts / prelim.smooth.seasonal.ts)

# Step 6 -- A better estimate of the trend
better.trend.ts <- ts( hma(as.vector(prelim.seasadj.ts), henderson),
start=start(prelim.seasadj.ts),
frequency=frequency(prelim.seasadj.ts) )

# Step 7 -- A final estimate of the seasonal elements
# -- note use of extended.ts
better.seasonal.ts <- extended.ts / better.trend.ts
final.seasonal.ts <- smooth.seasonal.components(better.seasonal.ts)
final.seasonal.ts <- window(final.seasonal.ts, start=start(original.ts),
end=end(original.ts))

# Step 8 -- final estimate of the seasonally adjusted series
final.seasadj.ts <- original.ts / final.seasonal.ts

# Step 9 -- final trend
final.trend.ts <- ts( hma(as.vector(final.seasadj.ts), henderson),
start=start(final.seasadj.ts),
frequency=frequency(final.seasadj.ts) )

# step 10 -- final irregular
final.irregular.ts <- final.seasadj.ts / final.trend.ts

# Step 11 -- package up everything we know and
# return it in a named list ...
return(list(original=original.ts, trend=final.trend.ts,
seasonal=final.seasonal.ts,
irregular=final.irregular.ts, seasadj=final.seasadj.ts))
}

seasonallyAdjust <- function(df, breaks, start, frequency, verbose=TRUE) {
# Apply seasonal adjustment to selected columns in a data frame
# returns an augmented data frame, such that:
# -- seasonally adjusted in new column break.sa
# -- trend in new column break.trend

sa.suffix <- '.sa'
trend.suffix <- '.trend'

for(j in seq_along(breaks)) {
if(verbose) { cat('seasonallyAdjust() Verbose: '); print(breaks[[j]]) }

# - unique labels for the seasonal and trend components
colName <- paste(breaks[[j]], sep='')
colName.sa <- paste(colName, sa.suffix, sep='', collapse='')
colName.trend <- paste(colName, trend.suffix, sep='', collapse='')

# - seasonally decompose the series
series <- df[, breaks[j]]
series.ts <- ts(series, start=start, frequency=frequency)

# --- code for seasonal adjustment using stl()
#series.stl <- stl(series.ts, s.window='periodic', s.jump=1, t.jump=1, l.jump=1)
#series.trend <- as.numeric(series.stl\$time.series[, 'trend'])
#series.sa <- series - as.numeric(series.stl\$time.series[, 'seasonal'])

series.season <- get.seasonal(original.ts=series.ts, verbose=verbose)
series.trend <- as.numeric(series.season\$trend)
series.sa <- as.numeric(series.season\$seasadj)

# - store it and remember it
df[, colName.sa] <- series.sa
df[, colName.trend] <- series.trend
}
return(df)
}```