
Overview of sktime functionalities for forecasting#

Author: Joanna Lenczuk#

The code in this notebook is explained in detail in this article: https://towardsdatascience.com/why-start-using-sktime-for-forecasting-8d6881c0a518

[1]:
# %%capture
# !pip install numpy
# !pip install pandas
# !pip install seaborn
# !pip install 'sktime[all_extras]'
[2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.arima import AutoARIMA
from sktime.forecasting.exp_smoothing import ExponentialSmoothing
from sktime.forecasting.ets import AutoETS
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.tbats import TBATS
from sktime.forecasting.theta import ThetaForecaster
from sktime.forecasting.compose import (
    EnsembleForecaster,
    MultiplexForecaster,
    TransformedTargetForecaster,
    make_reduction,
    ForecastingPipeline,
    DirectTabularRegressionForecaster
)
from sktime.forecasting.model_selection import (
    SlidingWindowSplitter,
    ExpandingWindowSplitter,
    ForecastingGridSearchCV,
    ForecastingRandomizedSearchCV)
from sktime.forecasting.model_evaluation import evaluate
from sktime.transformations.series.detrend import Detrender, Deseasonalizer
from sktime.transformations.series.boxcox import BoxCoxTransformer, LogTransformer
from sktime.transformations.series.outlier_detection import HampelFilter
from sktime.transformations.compose import OptionalPassthrough
from sktime.performance_metrics.forecasting import (
    mean_absolute_scaled_error,
    median_absolute_percentage_error,
    make_forecasting_scorer,
    MeanAbsolutePercentageError,
    MedianAbsolutePercentageError)
from sktime.utils.plotting import plot_series
[3]:
import sktime
sktime.__version__
[3]:
'0.24.1'
[4]:
import warnings
warnings.filterwarnings("ignore")
[5]:
sns.set(rc={'figure.figsize':(12,6)})

Importing the data#

[6]:
df = sm.datasets.macrodata.load_pandas()['data']
[7]:
df.head()
[7]:
year quarter realgdp realcons realinv realgovt realdpi cpi m1 tbilrate unemp pop infl realint
0 1959.0 1.0 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 5.8 177.146 0.00 0.00
1 1959.0 2.0 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 5.1 177.830 2.34 0.74
2 1959.0 3.0 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 5.3 178.657 2.74 1.09
3 1959.0 4.0 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 5.6 179.386 0.27 4.06
4 1960.0 1.0 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 5.2 180.007 2.31 1.19

Data overview#

The dataset contains quarterly macroeconomic data for the United States from 1959 to 2009.

Target variable - ``realgdp``

[8]:
df['q_date'] = df.apply(lambda x: str(int(x['year'])) + "-" + 'Q'+ str(int(x['quarter'])), axis=1)
df['date'] = pd.PeriodIndex(df['q_date'], freq='Q').to_timestamp()
[9]:
df.head()
[9]:
year quarter realgdp realcons realinv realgovt realdpi cpi m1 tbilrate unemp pop infl realint q_date date
0 1959.0 1.0 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 5.8 177.146 0.00 0.00 1959-Q1 1959-01-01
1 1959.0 2.0 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 5.1 177.830 2.34 0.74 1959-Q2 1959-04-01
2 1959.0 3.0 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 5.3 178.657 2.74 1.09 1959-Q3 1959-07-01
3 1959.0 4.0 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 5.6 179.386 0.27 4.06 1959-Q4 1959-10-01
4 1960.0 1.0 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 5.2 180.007 2.31 1.19 1960-Q1 1960-01-01
[10]:
for val in df.columns[2:14]:
    plt.plot(df.date, df[val], label = val)
plt.legend();
../../_images/examples_blog_posts_Overview_of_sktime_functionalities_for_forecasting_12_0.png
[11]:
df.columns
[11]:
Index(['year', 'quarter', 'realgdp', 'realcons', 'realinv', 'realgovt',
       'realdpi', 'cpi', 'm1', 'tbilrate', 'unemp', 'pop', 'infl', 'realint',
       'q_date', 'date'],
      dtype='object')
[12]:
df = df.set_index('date').to_period("Q")
df.index
[12]:
PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', name='date', length=203)

1. Unified API#

Basic forecaster interface#

[13]:
y = df['realgdp']
y_train, y_test = temporal_train_test_split(y, test_size=30)
[14]:
forecaster = NaiveForecaster()
forecaster.fit(y_train)
y_pred = forecaster.predict(fh=1) #forecasting one step ahead
[15]:
y_pred
[15]:
2002Q2    11477.868
Freq: Q-DEC, Name: realgdp, dtype: float64
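
A relative horizon with several steps can also be passed straight to ``predict`` as a list of integers; a minimal sketch (not an original cell of this notebook):

#forecasting one, two and three steps ahead of the end of the training series
y_pred_multi = forecaster.predict(fh=[1, 2, 3])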

Forecasting horizon#

[16]:
fh_abs = ForecastingHorizon(y_test.index, is_relative=False) #specific data points
fh_abs
[16]:
ForecastingHorizon(['2002Q2', '2002Q3', '2002Q4', '2003Q1', '2003Q2', '2003Q3',
             '2003Q4', '2004Q1', '2004Q2', '2004Q3', '2004Q4', '2005Q1',
             '2005Q2', '2005Q3', '2005Q4', '2006Q1', '2006Q2', '2006Q3',
             '2006Q4', '2007Q1', '2007Q2', '2007Q3', '2007Q4', '2008Q1',
             '2008Q2', '2008Q3', '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', name='date', is_relative=False)

Forecasting on the test set

[17]:
#cutoff is the last point in the training series
cutoff = pd.Period("2002-01-01", freq="Q")

#forecasting 1, 2, ..., 30 steps ahead of the last point in the training series
fh_rel = fh_abs.to_relative(cutoff)
fh_rel
[17]:
ForecastingHorizon([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
            18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
           dtype='int64', is_relative=True)
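
The conversion also works in the other direction: given the cutoff, a relative horizon can be turned back into absolute time points. A small sketch (not an original cell of this notebook):

#converting the relative horizon back to absolute quarters
fh_abs_again = fh_rel.to_absolute(cutoff)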

In-sample forecasting

[18]:
#cutoff is set to the last point of the test series (30 quarters after the end of the training series)
cutoff_insample = pd.Period("2009-09-01", freq="Q")

#relative to this cutoff, the horizon becomes 29, 28, ..., 1, 0 steps back, i.e. in-sample
fh_rel_insample = fh_abs.to_relative(cutoff_insample)
fh_rel_insample
[18]:
ForecastingHorizon([-29, -28, -27, -26, -25, -24, -23, -22, -21, -20, -19, -18, -17,
            -16, -15, -14, -13, -12, -11, -10,  -9,  -8,  -7,  -6,  -5,  -4,
             -3,  -2,  -1,   0],
           dtype='int64', is_relative=True)
[19]:
forecaster = NaiveForecaster()
forecaster.fit(y_train)
y_pred = forecaster.predict(fh=fh_rel)
y_pred_insample = forecaster.predict(fh=fh_rel_insample)

plot_series(y_train, y_test, y_pred, y_pred_insample, labels=["y_train", "y_test", "y_pred", "y_pred_insample"]);
../../_images/examples_blog_posts_Overview_of_sktime_functionalities_for_forecasting_26_0.png

Forward forecasting

[20]:
#cutoff is 12 quarters before the last point in the training series
cutoff_forward = pd.Period("1999-01-01", freq="Q")

#relative to this cutoff, the horizon becomes 13, 14, ..., 42 steps ahead
fh_rel_forward = fh_abs.to_relative(cutoff_forward)
fh_rel_forward
[20]:
ForecastingHorizon([13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
            30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42],
           dtype='int64', is_relative=True)
[21]:
forecaster = NaiveForecaster()
forecaster.fit(y_train)
y_pred = forecaster.predict(fh=fh_rel)
y_pred_forward = forecaster.predict(fh=fh_rel_forward)

plot_series(y_train, y_test, y_pred,y_pred_forward, labels=["y_train", "y_test", "y_pred", "y_pred_forward"]);
../../_images/examples_blog_posts_Overview_of_sktime_functionalities_for_forecasting_29_0.png

Univariate time series#

[22]:
y = df['realgdp']
y_train, y_test = temporal_train_test_split(y, test_size=30)

forecaster = AutoARIMA(sp=4)
forecaster.fit(y_train)
y_pred = forecaster.predict(fh=fh_abs)

print(f"MASE: {round(mean_absolute_scaled_error(y_true=y_test, y_pred=y_pred, y_train=y_train), 3)}")
plot_series(y_train, y_test, y_pred, labels=["y_train", "y_test", "y_pred"]);
MASE: 2.864
../../_images/examples_blog_posts_Overview_of_sktime_functionalities_for_forecasting_31_1.png
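
MASE scales the forecast's mean absolute error by the in-sample mean absolute error of a one-step naive forecast, so values above 1 mean the model does worse than that naive benchmark. A minimal sketch of the computation, assuming the default sp=1 used above (not an original cell of this notebook):

#MASE by hand: forecast MAE divided by the in-sample MAE of a one-step naive forecast
mae_forecast = np.mean(np.abs(y_test.values - y_pred.values))
mae_naive_insample = np.mean(np.abs(np.diff(y_train.values)))
print(mae_forecast / mae_naive_insample)  #should agree with mean_absolute_scaled_error above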
[23]:
y = df['realgdp']
y_train, y_test = temporal_train_test_split(y, test_size=30)

forecaster = DirectTabularRegressionForecaster(LinearRegression())
forecaster.fit(y_train, fh=fh_abs) #forecasting horizon necessary in the fit() method for this forecaster
y_pred = forecaster.predict()

print(f"MASE: {round(mean_absolute_scaled_error(y_true=y_test, y_pred=y_pred, y_train=y_train), 3)}")
plot_series(y_train, y_test, y_pred, labels=["y_train", "y_test", "y_pred"]);
MASE: 5.496
../../_images/examples_blog_posts_Overview_of_sktime_functionalities_for_forecasting_32_1.png

Multivariate exogenous time series#

Forecasting with exogenous variables (i.e. multivariate X)

[24]:
#creating lagged values to avoid data leakage
df.loc[:, 'realinv_lagged'] = df.loc[:, 'realinv'].shift()
df[['realinv_lagged']] = df[['realinv_lagged']].fillna(method='backfill')
[25]:
y = df['realgdp']
X = df[['realinv_lagged']] #could be more features
y_train, y_test, X_train, X_test = temporal_train_test_split(y, X, fh=fh_abs)

forecaster = AutoARIMA(sp=4)
forecaster.fit(y_train, X_train)
y_pred = forecaster.predict(X=X_test, fh=fh_abs)

print(f"MASE: {round(mean_absolute_scaled_error(y_true=y_test, y_pred=y_pred, y_train=y_train), 3)}")
plot_series(y_train, y_test, y_pred, labels=["y_train", "y_test", "y_pred"]);
MASE: 6.708
../../_images/examples_blog_posts_Overview_of_sktime_functionalities_for_forecasting_35_1.png

Updating the forecaster#

[26]:
y_old_data = y_train[:-4]
forecaster = AutoARIMA(sp=4)
forecaster.fit(y_old_data)
y_old_pred = forecaster.predict(fh_rel)
[27]:
#updating the model with new data (full year) and generating predictions for the next period
y_new_data = y_train[-4:]
forecaster.update(y_new_data)
y_new_pred = forecaster.predict(fh_rel)
[28]:
plot_series(y_train,  y_old_pred, y_new_pred,
            labels=["y_train",  "y_old_pred", 'y_new_pred']);
../../_images/examples_blog_posts_Overview_of_sktime_functionalities_for_forecasting_39_0.png
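
By default ``update`` also re-estimates the model parameters on the extended series; passing ``update_params=False`` only advances the cutoff and the stored data while keeping the originally fitted parameters. A small sketch (not an original cell of this notebook):

#the same update, but without re-estimating the model parameters
forecaster_no_refit = AutoARIMA(sp=4)
forecaster_no_refit.fit(y_old_data)
forecaster_no_refit.update(y_new_data, update_params=False)
y_new_pred_no_refit = forecaster_no_refit.predict(fh_rel)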

2. Turning forecasting into a regression problem#

[29]:
regressor = LinearRegression()
forecaster = make_reduction(estimator=regressor, strategy='recursive', window_length=4)
forecaster.fit(y_train)
y_pred = forecaster.predict(fh_abs)

print(f"MASE: {round(mean_absolute_scaled_error(y_true=y_test, y_pred=y_pred, y_train=y_train), 3)}")
plot_series(y_train, y_test, y_pred, labels=["y_train", "y_test", "y_pred"]);
MASE: 4.104
../../_images/examples_blog_posts_Overview_of_sktime_functionalities_for_forecasting_41_1.png
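
Under the hood, reduction turns the series into a tabular supervised-learning problem: each row holds ``window_length`` lagged values and the target is the observation that follows them. A rough illustration in plain numpy (a sketch of the idea only, not sktime's internal code):

#building the lag matrix that recursive reduction with window_length=4 trains on
window_length = 4
values = y_train.values
X_tab = np.column_stack([values[i:len(values) - window_length + i] for i in range(window_length)])
y_tab = values[window_length:]
print(X_tab.shape, y_tab.shape)  #each row: 4 past quarters, target: the following quarter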
[30]:
#direct reduction strategy
regressor = LinearRegression()
forecaster = DirectTabularRegressionForecaster(estimator=regressor, window_length=4)
forecaster.fit(y=y_train, fh=fh_abs)
y_pred = forecaster.predict()

print(f"MASE: {round(mean_absolute_scaled_error(y_true=y_test, y_pred=y_pred, y_train=y_train), 3)}")
plot_series(y_train, y_test, y_pred, labels=["y_train", "y_test", "y_pred"]);
MASE: 4.984
../../_images/examples_blog_posts_Overview_of_sktime_functionalities_for_forecasting_42_1.png

3. Evaluation of forecasting models#

Performance metrics#

[31]:
forecaster = ThetaForecaster()
forecaster.fit(y_train)
y_pred = forecaster.predict(fh=fh_abs)
plot_series(y_train, y_test, y_pred, labels=["y_train", "y_test", "y_pred"]);
../../_images/examples_blog_posts_Overview_of_sktime_functionalities_for_forecasting_45_0.png
[32]:
#MdAPE function
median_absolute_percentage_error(y_test, y_pred)
[32]:
0.06012960019008075
[33]:
#MdAPE class
mdape = MedianAbsolutePercentageError()
mdape(y_true=y_test, y_pred=y_pred)
[33]:
0.06012960019008075
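
The imported ``MeanAbsolutePercentageError`` class is used in the same way as its median counterpart; a minimal sketch (note that, depending on the sktime version, its ``symmetric`` default may differ):

#MAPE class
mape = MeanAbsolutePercentageError()
mape(y_true=y_test, y_pred=y_pred)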
[34]:
def custom_mape(y_true: np.ndarray, y_pred: np.ndarray, multioutput: str):
    metrics_dict = {'uniform_average': np.mean(np.abs((y_true - np.ceil(y_pred)) / y_true)),
                    'raw_values': np.abs((y_true - np.ceil(y_pred)) / y_true)}
    try:
        return metrics_dict[multioutput]
    except KeyError:
        print("multioutput not specified correctly - pick `raw_values` or `uniform_average`")

[35]:
mape_changed = make_forecasting_scorer(func=custom_mape, multioutput = 'uniform_average')
mape_changed(y_test, y_pred)
[35]:
0.05751249071487726
[36]:
mape_changed = make_forecasting_scorer(func=custom_mape, multioutput = 'raw_values')
mape_changed(y_test, y_pred)
[36]:
array([0.00102004, 0.00391758, 0.002054  , 0.00402024, 0.00977161,
       0.02408462, 0.03079155, 0.03550762, 0.04034922, 0.04539615,
       0.05159286, 0.0590479 , 0.06113469, 0.0663205 , 0.06915992,
       0.07935452, 0.08080733, 0.07912407, 0.08396799, 0.08487858,
       0.09019835, 0.09639655, 0.09933719, 0.0958987 , 0.09729706,
       0.08935353, 0.07486882, 0.05743802, 0.05383124, 0.05845428])

Time-series cross-validation#

  • Expanding Window

[37]:
forecaster = AutoARIMA(sp=4)
cv = ExpandingWindowSplitter(step_length=30, fh=list(range(1,31)), initial_window=40)
cv_df = evaluate(forecaster=forecaster, y=y, cv=cv, strategy="update", return_data=True)
[38]:
cv_df
[38]:
test_MeanAbsolutePercentageError fit_time pred_time len_train_window cutoff y_train y_test y_pred
0 0.027279 2.230224 0.024685 40 1968Q4 date 1959Q1 2710.349 1959Q2 2778.801 195... date 1969Q1 4244.100 1969Q2 4256.460 196... 1969Q1 4211.384583 1969Q2 4248.007673 19...
1 0.022458 4.483943 0.045663 70 1976Q2 date 1959Q1 2710.349 1959Q2 2778.801 195... date 1976Q3 5154.072 1976Q4 5191.499 197... 1976Q3 5183.195992 1976Q4 5222.693348 19...
2 0.059587 3.957826 0.027062 100 1983Q4 date 1959Q1 2710.349 1959Q2 2778.801 195... date 1984Q1 6448.264 1984Q2 6559.594 198... 1984Q1 6385.598862 1984Q2 6428.366855 19...
3 0.049360 2.459181 0.020844 130 1991Q2 date 1959Q1 2710.349 1959Q2 2778.801 195... date 1991Q3 8037.538 1991Q4 8069.046 1... 1991Q3 8048.706257 1991Q4 8090.890623 19...
4 0.009310 9.126039 0.040927 160 1998Q4 date 1959Q1 2710.349 1959Q2 2778.801 1... date 1999Q1 10601.179 1999Q2 10684.049 1... 1999Q1 10623.174768 1999Q2 10727.402350 ...
[39]:
plot_series(
    y, *[cv_df["y_pred"].iloc[x] for x in range(5)],
    markers=["o", *["."]*5],
    labels=["y_true"] + [f"cv: {x}" for x in range(5)]
)
print(f"Average MAPE with Expanding Window: {round(cv_df['test_MeanAbsolutePercentageError'].mean(), 3)}")
plt.title('Cross-validation with Expanding Window');
Average MAPE with Expanding Window: 0.034
../../_images/examples_blog_posts_Overview_of_sktime_functionalities_for_forecasting_55_1.png
  • Sliding Window

[40]:
forecaster = AutoARIMA(sp=4)
cv = SlidingWindowSplitter(step_length=30, fh=list(range(1,31)), initial_window=40)
cv_df = evaluate(forecaster=forecaster, y=y, cv=cv, strategy="update", return_data=True)
[41]:
cv_df
[41]:
test_MeanAbsolutePercentageError fit_time pred_time len_train_window cutoff y_train y_test y_pred
0 0.027279 2.242027 0.024409 40 1968Q4 1959Q1 2710.349 1959Q2 2778.801 1959Q3 ... 1969Q1 4244.100 1969Q2 4256.460 1969Q3 ... 1969Q1 4211.384583 1969Q2 4248.007673 19...
1 0.032508 3.777926 0.056402 10 1976Q2 1974Q1 4909.617 1974Q2 4922.188 1974Q3 ... 1976Q3 5154.072 1976Q4 5191.499 1977Q1 ... 1976Q3 5178.306143 1976Q4 5227.665286 19...
2 0.017365 2.365881 0.022176 10 1983Q4 1981Q3 6030.184 1981Q4 5955.062 1982Q1 ... 1984Q1 6448.264 1984Q2 6559.594 1984Q3 ... 1984Q1 6386.849 1984Q2 6448.124 1984Q3 ...
3 0.011343 1.619386 0.020145 10 1991Q2 1989Q1 7806.603 1989Q2 7865.016 1989Q3 ... 1991Q3 8037.538 1991Q4 8069.046 1992Q1... 1991Q3 8080.539 1991Q4 8157.256 1992Q1...
4 0.052317 13.451261 0.017559 10 1998Q4 1996Q3 9488.879 1996Q4 9592.458 1997Q1... 1999Q1 10601.179 1999Q2 10684.049 1999Q3... 1999Q1 10749.487249 1999Q2 10779.693065 ...
[42]:
plot_series(
    y, *[cv_df["y_pred"].iloc[x] for x in range(5)],
    markers=["o", *["."]*5],
    labels=["y_true"] + [f"cv: {x}" for x in range(5)],
)
print(f"Average MAPE with Sliding Window: {round(cv_df['test_MeanAbsolutePercentageError'].mean(), 3)}")
plt.title('Cross-validation with Sliding Window');
Average MAPE with Sliding Window: 0.028
../../_images/examples_blog_posts_Overview_of_sktime_functionalities_for_forecasting_59_1.png
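
The splitters can also be inspected directly: iterating over ``cv.split`` yields the index arrays of every training and test window. A small sketch (not an original cell of this notebook):

#printing the size of each training and test window produced by the sliding splitter
cv = SlidingWindowSplitter(step_length=30, fh=list(range(1, 31)), initial_window=40)
for train_idx, test_idx in cv.split(y):
    print(len(train_idx), len(test_idx))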

Tuning#

  • Grid Search with statistical models

[43]:
forecaster = AutoETS(sp=4, allow_multiplicative_trend=True)
params = {"trend": ["add", "mul"],
          "damped_trend": [True, False]}
[44]:
cv = SlidingWindowSplitter(initial_window=60, window_length=30)
fgs = ForecastingGridSearchCV(forecaster, strategy="refit", cv=cv, param_grid=params)
[45]:
fgs.fit(y_train)
y_pred = fgs.predict(fh_abs)
[46]:
fgs.best_params_
[46]:
{'damped_trend': False, 'trend': 'add'}
[47]:
fgs.best_score_
[47]:
0.00674066038278309
[48]:
plot_series(y_train, y_test, y_pred, labels=["y_train", "y_test", "y_pred"]);
../../_images/examples_blog_posts_Overview_of_sktime_functionalities_for_forecasting_67_0.png
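
The fitted grid search also exposes the scores of all parameter combinations via its ``cv_results_`` attribute, mirroring scikit-learn; a minimal sketch, assuming that attribute is available in the installed sktime version:

#comparing all parameter combinations tried by the grid search
pd.DataFrame(fgs.cv_results_)
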
  • Randomized Search with scikit-learn estimators

[49]:
regressor = LinearRegression()
forecaster = make_reduction(regressor, strategy="recursive")
params = {"window_length": list(range(2, 21))}
[50]:
cv = SlidingWindowSplitter(initial_window=60, window_length=30)
frs = ForecastingRandomizedSearchCV(forecaster, strategy="refit", cv=cv,
                                     param_distributions=params, n_iter=5, random_state=42)
[51]:
frs.fit(y_train)
y_pred = frs.predict(fh_abs)
[52]:
frs.best_params_
[52]:
{'window_length': 2}
[53]:
frs.best_score_
[53]:
0.006820844800594237
[54]:
plot_series(y_train, y_test, y_pred, labels=["y_train", "y_test", "y_pred"]);
../../_images/examples_blog_posts_Overview_of_sktime_functionalities_for_forecasting_74_0.png
  • Tuning nested parameters

Random Forest Regressor

[55]:
regressor = RandomForestRegressor(n_estimators=20)
forecaster = make_reduction(regressor)
[56]:
forecaster.get_params()
[56]:
{'estimator': RandomForestRegressor(n_estimators=20),
 'pooling': 'local',
 'transformers': None,
 'window_length': 10,
 'estimator__bootstrap': True,
 'estimator__ccp_alpha': 0.0,
 'estimator__criterion': 'squared_error',
 'estimator__max_depth': None,
 'estimator__max_features': 1.0,
 'estimator__max_leaf_nodes': None,
 'estimator__max_samples': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__n_estimators': 20,
 'estimator__n_jobs': None,
 'estimator__oob_score': False,
 'estimator__random_state': None,
 'estimator__verbose': 0,
 'estimator__warm_start': False}
[57]:
nested_params = {"window_length": list(range(2,21)),
                 "estimator__max_depth": list(range(5,16))}
[58]:
cv = SlidingWindowSplitter(initial_window=60, window_length=30)
nrcv = ForecastingRandomizedSearchCV(forecaster, strategy="refit", cv=cv,
                                     param_distributions=nested_params,
                                     n_iter = 5, random_state=42)
[59]:
nrcv.fit(y_train)
y_pred = nrcv.predict(fh_abs)
[60]:
nrcv.best_params_
[60]:
{'window_length': 5, 'estimator__max_depth': 8}
[61]:
nrcv.best_score_
[61]:
0.01392739416525403
[62]:
plot_series(y_train, y_test, y_pred, labels=["y_train", "y_test", "y_pred"]);
../../_images/examples_blog_posts_Overview_of_sktime_functionalities_for_forecasting_83_0.png

4. New forecasting functionalities#

Transformers#

``Deseasonalizer`` (removing seasonality)

[63]:
transformer = Deseasonalizer(sp=4) #removing seasonality
y_train_transformed = transformer.fit_transform(y_train)
[64]:
y_train_inversed = transformer.inverse_transform(y_train_transformed) #inverting the seasonality removal
[65]:
#after inverting the transformation, the values are back on the same scale as before
(np.round(y_train.values, 3) == np.round(y_train_inversed.values, 3)).all()
[65]:
True
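
``Detrender`` is imported above but not used in this notebook; it works analogously, removing a fitted trend (by default a linear one) instead of the seasonal component. A minimal sketch (not an original cell of this notebook):

#removing the fitted trend from the training series and adding it back
detrender = Detrender()
y_train_detrended = detrender.fit_transform(y_train)
y_train_retrended = detrender.inverse_transform(y_train_detrended)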

``BoxCoxTransformer``

[66]:
forecaster = ThetaForecaster(sp=4)
transformer = BoxCoxTransformer(sp=4) #Box-Cox power transformation
y_train_transformed = transformer.fit_transform(y_train)
[67]:
forecaster.fit(y_train_transformed)
y_pred = forecaster.predict(fh=fh_abs)
y_pred_inversed = transformer.inverse_transform(y_pred)
[68]:
plot_series(y_train, y_test, y_pred, labels=["y_train", "y_test", "y_pred_transformed"]);
../../_images/examples_blog_posts_Overview_of_sktime_functionalities_for_forecasting_93_0.png
[69]:
plot_series(y_train, y_test, y_pred_inversed, labels=["y_train", "y_test", "y_pred_inversed_transformation"]);
../../_images/examples_blog_posts_Overview_of_sktime_functionalities_for_forecasting_94_0.png

Pipelines#

  • ``TransformedTargetForecaster`` - a simple pipeline

[70]:
#two transformers and a forecaster
forecaster = TransformedTargetForecaster(
    [("deseasonalize", Deseasonalizer(sp=4)),
    ("box_cox", BoxCoxTransformer(sp=4)),
    ("forecast", ThetaForecaster(sp=4))]
)
[71]:
forecaster.fit(y_train)
y_pred = forecaster.predict(fh_abs)
[72]:
plot_series(y_train, y_test, y_pred, labels=["y_train", "y_test", "y_pred"]);
../../_images/examples_blog_posts_Overview_of_sktime_functionalities_for_forecasting_99_0.png
  • ``ForecastingPipeline`` - a pipeline with exogenous variables (its transformers are applied to X rather than to y)

[73]:
forecaster = ForecastingPipeline(
    [("hampel", HampelFilter()),
    ("log", LogTransformer()),
    ("forecaster", ThetaForecaster(sp=4))])
[74]:
forecaster.fit(y_train, X_train)
y_pred = forecaster.predict(fh=fh_abs, X=X_test)
[75]:
plot_series(y_train, y_test, y_pred, labels=["y_train", "y_test", "y_pred"]);
../../_images/examples_blog_posts_Overview_of_sktime_functionalities_for_forecasting_103_0.png

Automated model selection#

  • ``MultiplexForecaster`` - finding the best forecaster

[76]:
forecaster = MultiplexForecaster(
    forecasters=[
        ("theta", ThetaForecaster(sp=4)),
        ("ets", ExponentialSmoothing(trend="add", sp=4)),
        ("autoarima", AutoARIMA(sp=4)),
    ],
)
[77]:
cv = SlidingWindowSplitter(initial_window=60, window_length=30)
forecaster_param_grid = {"selected_forecaster": ["theta", "ets", "autoarima"]}
gscv = ForecastingGridSearchCV(forecaster, cv=cv, param_grid=forecaster_param_grid)
[78]:
gscv.fit(y_train)
[78]:
ForecastingGridSearchCV(cv=SlidingWindowSplitter(initial_window=60,
                                                 window_length=30),
                        forecaster=MultiplexForecaster(forecasters=[('theta',
                                                                     ThetaForecaster(sp=4)),
                                                                    ('ets',
                                                                     ExponentialSmoothing(sp=4,
                                                                                          trend='add')),
                                                                    ('autoarima',
                                                                     AutoARIMA(sp=4))]),
                        param_grid={'selected_forecaster': ['theta', 'ets',
                                                            'autoarima']})
[79]:
gscv.best_params_
[79]:
{'selected_forecaster': 'ets'}
[80]:
gscv.best_forecaster_
[80]:
MultiplexForecaster(forecasters=[('theta', ThetaForecaster(sp=4)),
                                 ('ets',
                                  ExponentialSmoothing(sp=4, trend='add')),
                                 ('autoarima', AutoARIMA(sp=4))],
                    selected_forecaster='ets')
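
The tuned multiplexer can be used like any other forecaster, for example to predict over the test horizon; a small sketch (not an original cell of this notebook):

#forecasting with the forecaster selected by the grid search (here: ExponentialSmoothing)
y_pred_multiplex = gscv.predict(fh_abs)
plot_series(y_train, y_test, y_pred_multiplex, labels=["y_train", "y_test", "y_pred_multiplex"]);
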
  • ``OptionalPassthrough`` - finding the best transformations

[81]:
forecaster = TransformedTargetForecaster([("deseasonalizer", OptionalPassthrough(Deseasonalizer())),
                                          ("forecaster", ExponentialSmoothing())])
forecaster.get_params()
[81]:
{'steps': [('deseasonalizer',
   OptionalPassthrough(transformer=Deseasonalizer())),
  ('forecaster', ExponentialSmoothing())],
 'deseasonalizer': OptionalPassthrough(transformer=Deseasonalizer()),
 'forecaster': ExponentialSmoothing(),
 'deseasonalizer__passthrough': False,
 'deseasonalizer__transformer': Deseasonalizer(),
 'deseasonalizer__transformer__model': 'additive',
 'deseasonalizer__transformer__sp': 1,
 'forecaster__damped_trend': False,
 'forecaster__damping_trend': None,
 'forecaster__initial_level': None,
 'forecaster__initial_seasonal': None,
 'forecaster__initial_trend': None,
 'forecaster__initialization_method': 'estimated',
 'forecaster__method': None,
 'forecaster__minimize_kwargs': None,
 'forecaster__optimized': True,
 'forecaster__random_state': None,
 'forecaster__remove_bias': False,
 'forecaster__seasonal': None,
 'forecaster__smoothing_level': None,
 'forecaster__smoothing_seasonal': None,
 'forecaster__smoothing_trend': None,
 'forecaster__sp': None,
 'forecaster__start_params': None,
 'forecaster__trend': None,
 'forecaster__use_boxcox': None,
 'forecaster__use_brute': True}
[82]:
cv = SlidingWindowSplitter(initial_window=60, window_length=30)
params = {"deseasonalizer__passthrough": [True, False],
          "deseasonalizer__transformer__model": ["additive", "multiplicative"]}

gscv = ForecastingGridSearchCV(forecaster=forecaster,
                               param_grid=params,
                               cv=cv)
[83]:
gscv.fit(y_train)
gscv.best_params_ #passthrough=True, i.e. the model performs better without the deseasonalizing step
[83]:
{'deseasonalizer__passthrough': True,
 'deseasonalizer__transformer__model': 'additive'}

Ensemble forecasting#

[84]:
ensemble = EnsembleForecaster(forecasters = [("tbats", TBATS()),
                                             ("autoarima", AutoARIMA())],
                              aggfunc = "min")
[85]:
ensemble.fit(y_train)
y_pred = ensemble.predict(fh_abs) #predictions aggregated as the element-wise minimum of both models (aggfunc="min")
[86]:
ensemble.forecasters_
[86]:
[TBATS(), AutoARIMA()]
[87]:
plot_series(y_train, y_test, y_pred, labels=["y_train", "y_test", "y_pred"]);
../../_images/examples_blog_posts_Overview_of_sktime_functionalities_for_forecasting_119_0.png
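
With the default ``aggfunc="mean"`` the ensemble would instead average the forecasts of its members; a minimal sketch (not an original cell of this notebook):

#averaging the two forecasters instead of taking their minimum
ensemble_mean = EnsembleForecaster(forecasters=[("tbats", TBATS()),
                                                ("autoarima", AutoARIMA())],
                                   aggfunc="mean")
ensemble_mean.fit(y_train)
y_pred_mean = ensemble_mean.predict(fh_abs)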

Generated by nbsphinx. The Jupyter notebook can be found here.