User CB_EN_U4CSE21417

About

Univariate Model – Applied Ridge Regression for optimum alpha value:

# Ridge regression of Close on the OHLCV predictors, with the regularisation
# strength (alpha) chosen by 5-fold cross-validated grid search.
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

X = df[['Open', 'High', 'Low', 'Volume']]
y = df['Close']

# Split BEFORE scaling: fitting the scaler on every row would leak the test
# set's mean/variance into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so that code expecting the fully scaled matrix still finds it.
X_scaled = scaler.transform(X)

# 50 candidate alphas, log-spaced between 1e-4 and 1e4.
ridge = Ridge()
alpha_range = np.logspace(-4, 4, 50)
param_grid = {'alpha': alpha_range}
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_alpha = grid_search.best_params_['alpha']
print(f"Optimum alpha value: {best_alpha}")

# Refit with the selected alpha and score on the held-out rows.
ridge_best = Ridge(alpha=best_alpha)
ridge_best.fit(X_train, y_train)
y_pred = ridge_best.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on test set: {mse}")

Regression Relationship plot:

# One scatter-plus-regression-line panel per predictor, arranged on a 2x2 grid.
predictors = ['Open', 'High', 'Low', 'Volume']
scatter_style = {'alpha': 0.5}
line_style = {'color': 'red'}

plt.figure(figsize=(15, 10))
for panel, predictor in enumerate(predictors, start=1):
    plt.subplot(2, 2, panel)
    sns.regplot(
        x=df[predictor],
        y=df['Close'],
        scatter_kws=scatter_style,
        line_kws=line_style,
    )
    plt.title(f'Regression of Close on {predictor}')
    plt.xlabel(predictor)
    plt.ylabel('Close')

# Layout and display happen once, after all four panels are drawn.
plt.tight_layout()
plt.show()

Trends, Seasonal and Residual for each component:

from statsmodels.tsa.seasonal import seasonal_decompose

# Additive decomposition of each column into observed / trend / seasonal /
# residual parts, one four-panel figure per column.
# period=252 — presumably one trading year of daily rows; confirm against the data.
for column in ['Open', 'Close', 'High', 'Low', 'Volume']:
    series = df[column].dropna()
    result = seasonal_decompose(series, model='additive', period=252)

    panels = [
        (result.observed, f'Observed - {column}'),
        (result.trend, 'Trend'),
        (result.seasonal, 'Seasonal'),
        (result.resid, 'Residual'),
    ]

    plt.figure(figsize=(12, 8))
    for row, (component, panel_title) in enumerate(panels, start=1):
        plt.subplot(4, 1, row)
        plt.plot(component)
        plt.title(panel_title)
    plt.tight_layout()
    plt.show()

Create time based features

# Calendar features derived from the Date column (Date stays a regular column here).
df['Date'] = pd.to_datetime(df['Date'])
date_parts = df['Date'].dt

df['Year'] = date_parts.year
df['Month'] = date_parts.month
df['Week'] = date_parts.isocalendar().week
df['Day'] = date_parts.day
df['Day_of_Week'] = date_parts.dayofweek
df['Quarter'] = date_parts.quarter
df['Day_of_Year'] = date_parts.dayofyear

# Boolean flags are stored as 0/1 integers.
df['Is_Month_Start'] = date_parts.is_month_start.astype(int)
df['Is_Month_End'] = date_parts.is_month_end.astype(int)

print(df.head())

Creating lag features:

# Lag features: each price/volume column shifted back by 1, 2 and 3 rows.
# Only move Date into the index while it is still a column, so re-running this
# section after Date has already become the index does not raise KeyError.
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    df.set_index('Date', inplace=True)

lag_days = [1, 2, 3]
for lag in lag_days:
    # Column order per lag matches the original: Open, High, Low, Close, Volume.
    for column in ['Open', 'High', 'Low', 'Close', 'Volume']:
        df[f'{column}_Lag_{lag}'] = df[column].shift(lag)

print(df.head(10))

Power Ratios:

# Ratio features between price and volume columns.
# Date may already be the index (the lag-feature section sets it); only convert
# and re-index while it is still a column, otherwise df['Date'] raises KeyError.
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    df.set_index('Date', inplace=True)

df['Open_Close_Ratio'] = df['Open'] / df['Close']
df['High_Low_Ratio'] = df['High'] / df['Low']
df['Close_AdjClose_Ratio'] = df['Close'] / df['Adj Close']
df['Volume_High_Ratio'] = df['Volume'] / df['High']
df['Volume_Low_Ratio'] = df['Volume'] / df['Low']

ratio_columns = [
    'Open_Close_Ratio',
    'High_Low_Ratio',
    'Close_AdjClose_Ratio',
    'Volume_High_Ratio',
    'Volume_Low_Ratio',
]
print(df[ratio_columns].head(10))

Rolling window features

# Rolling mean / std / max / min of Close over 7- and 14-row windows.
# Guarded so the section also works when Date is already the index.
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    df.set_index('Date', inplace=True)

window_size_7 = 7
window_size_14 = 14

rolling_columns = []
# Outer loop over the statistic, inner over the window, which reproduces the
# original column order: Mean_7, Mean_14, Std_7, Std_14, Max_7, Max_14, Min_7, Min_14.
for stat in ('Mean', 'Std', 'Max', 'Min'):
    for window in (window_size_7, window_size_14):
        column = f'Rolling_{stat}_Close_{window}'
        df[column] = df['Close'].rolling(window=window).agg(stat.lower())
        rolling_columns.append(column)

print(df[['Close'] + rolling_columns].head(20))

Dimensionality Reduction :

PCA - Principal Component Analysis:

# Project the standardised numeric features onto their first two principal components.
from sklearn.decomposition import PCA  # PCA was used without ever being imported
from sklearn.preprocessing import StandardScaler

# Lag and rolling features start with NaN rows and the frame may hold
# non-numeric columns; neither StandardScaler nor PCA accepts those, so keep
# only complete numeric rows.
pca_input = df.select_dtypes(include='number').dropna().astype(float)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(pca_input)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
print("Explained variance ratio:", pca.explained_variance_ratio_)

plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.7)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA Result of Netflix Stock Data')
plt.show()

Simple Moving Average (SMA)

# Simple moving averages of Close over 7, 14 and 30 rows.
import pandas as pd
import numpy as np

df = pd.read_csv('NFLX.csv')
# The original dropped the first CSV column unconditionally — presumably an
# exported row-number column (verify against the file). Skip the drop when the
# first column is Date itself, otherwise the next line raises KeyError.
if df.columns[0] != 'Date':
    df = df.drop(df.columns[0], axis=1)
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)

window_size_7 = 7
window_size_14 = 14
window_size_30 = 30
df['SMA_7'] = df['Close'].rolling(window=window_size_7).mean()
df['SMA_14'] = df['Close'].rolling(window=window_size_14).mean()
df['SMA_30'] = df['Close'].rolling(window=window_size_30).mean()

print(df[['Close', 'SMA_7', 'SMA_14', 'SMA_30']].head(20))

Visualise

# Overlay the three simple moving averages on the closing price.
import matplotlib.pyplot as plt

plt.figure(figsize=(14, 7))
plt.plot(df['Close'], label='Close Price', color='blue', linewidth=1.5)
plt.plot(df['SMA_7'], label='7-Day SMA', color='orange', linewidth=1.2)
plt.plot(df['SMA_14'], label='14-Day SMA', color='green', linewidth=1.2)
plt.plot(df['SMA_30'], label='30-Day SMA', color='red', linewidth=1.2)
# The title previously named "Tata Motors"; the frame plotted here is loaded
# from NFLX.csv, so the label is corrected to match the data.
plt.title('Netflix Stock Price with Simple Moving Averages')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()
plt.grid(True)
plt.show()

Exponential Moving Average (EMA):

# Exponential moving averages of Close with spans of 7, 14 and 30.
# Date is already the index after the SMA section; only re-index while it is
# still a column, otherwise df['Date'] raises KeyError.
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    df.set_index('Date', inplace=True)

window_size_7 = 7
window_size_14 = 14
window_size_30 = 30
df['EMA_7'] = df['Close'].ewm(span=window_size_7, adjust=False).mean()
df['EMA_14'] = df['Close'].ewm(span=window_size_14, adjust=False).mean()
df['EMA_30'] = df['Close'].ewm(span=window_size_30, adjust=False).mean()

print(df[['Close', 'EMA_7', 'EMA_14', 'EMA_30']].head(20))

Cumulative Moving Average (CMA):

# Cumulative moving average: mean of every Close value seen so far.
# Guarded so the section also works when Date is already the index.
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    df.set_index('Date', inplace=True)

df['CMA'] = df['Close'].expanding().mean()
print(df[['Close', 'CMA']].head(20))

# Per-column skewness and kurtosis. Printed explicitly: as bare expressions the
# results are discarded when this runs as a script, and `Df` was a NameError.
print(df.skew(numeric_only=True))
print(df.kurtosis(numeric_only=True))

Setting Date-Time Index:

# Reload the CSV and index it by the parsed Date column.
raw = pd.read_csv('NFLX.csv')
df = raw.assign(Date=pd.to_datetime(raw['Date'])).set_index('Date')

import seaborn as sns

# Pairwise correlations between the frame's columns, printed and drawn as an
# annotated heatmap.
correlation_matrix = df.corr()
print(correlation_matrix)

heatmap_options = {'annot': True, 'cmap': 'coolwarm', 'fmt': '.2f'}
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Heatmap')
plt.show()

Resampling

Hourly up sampling:

import pandas as pd
import matplotlib.pyplot as plt

# Load the series, index it by date, then upsample to hourly rows; ffill()
# repeats the most recent earlier value into each new row.
df = pd.read_csv('google_Train (2010-2022).csv')
df['Date'] = pd.to_datetime(df['Date'])
df = df.set_index('Date')
df_hourly = df.resample('H').ffill()

hourly_close = df_hourly['Close']
plt.figure(figsize=(12, 6))
plt.plot(hourly_close, label='Hourly Upsampled')
plt.title('Hourly Upsampled Data')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.grid(True)
plt.legend()
plt.show()

30-Minute Upsampling

# Upsample to 30-minute rows; ffill() repeats the most recent earlier value.
df_30min = df.resample('30T').ffill()

plt.figure(figsize=(12, 6))
plt.plot(df_30min['Close'], label='30-Minutely Upsampled')
plt.title('30-Minutely Upsampled Data')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.grid(True)
# The line carries a label but the legend call was missing, so it was never
# shown; the other resampling plots all call plt.legend().
plt.legend()
plt.show()

Weekly Down-sampling for 2010

import pandas as pd
import matplotlib.pyplot as plt

# Load the series, keep the rows selected by the '2010' label, and downsample
# to weekly bins by taking the last value in each bin.
df = pd.read_csv('netflix_Train (2010-2022).csv')
df['Date'] = pd.to_datetime(df['Date'])
df = df.set_index('Date')
df_2010 = df.loc['2010']
df_weekly = df_2010.resample('W').last()

weekly_close = df_weekly['Close']
plt.figure(figsize=(12, 6))
plt.plot(weekly_close, label='Weekly Downsampled', color='blue')
plt.title('2010 Weekly Downsampled Data')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.grid(True)
plt.legend()
plt.show()

Monthly Down sampling for 2010:

# Downsample the 2010 rows to monthly bins, keeping the last value in each bin.
df_monthly = df_2010.resample('M').last()

monthly_close = df_monthly['Close']
plt.figure(figsize=(12, 6))
plt.plot(monthly_close, label='Monthly Downsampled', color='green')
plt.title('2010 Monthly Downsampled Data')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.grid(True)
plt.legend()
plt.show()

Quarterly down sampling for 2010 to 2022

import pandas as pd
import matplotlib.pyplot as plt

# Reload the full range and downsample to quarterly bins (last value per bin).
# The filename previously had a typographic opening quote and a stray space
# ("netflix _Train"); it now matches the name used by the weekly section.
df = pd.read_csv('netflix_Train (2010-2022).csv')
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)
df_quarterly = df.resample('Q').last()

plt.figure(figsize=(12, 6))
plt.plot(df_quarterly['Close'], label='Quarterly Downsampled', color='red')
plt.title('Quarterly Downsampled Data (2010-2022)')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.grid(True)
plt.legend()
plt.show()

Yearly Down sampling

# Downsample to yearly bins, keeping the last value in each bin.
df_yearly = df.resample('Y').last()

plt.figure(figsize=(12, 6))
plt.plot(df_yearly['Close'], label='Yearly Downsampled', color='purple')
plt.title('Yearly Downsampled Data (2010-2022)')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.grid(True)
plt.legend()  # a stray "n" after this call made the line a syntax error
plt.show()

Interpolation

Linear Interpolation

import pandas as pd
import matplotlib.pyplot as plt

# The filename previously carried a leading space and a stray inner space
# (' netflix _Train ...'); it now matches the name used by the weekly section.
df = pd.read_csv('netflix_Train (2010-2022).csv')
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)

# Knock out one Close value so there is a gap to interpolate.
df.loc['2010-01-04', 'Close'] = None
df_linear = df.interpolate(method='linear')

plt.figure(figsize=(12, 6))
plt.plot(df['Close'], label='Original', alpha=0.6)
plt.plot(df_linear['Close'], label='Linear Interpolation', linestyle='--')
plt.title('Linear Interpolation')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.grid(True)
plt.legend()
plt.show()

Polynomial Interpolation

Polynomial interpolation (order=2)

# Fill missing values with a second-order polynomial and compare with the original.
poly_order = 2
df_poly = df.interpolate(method='polynomial', order=poly_order)

original_close = df['Close']
poly_close = df_poly['Close']

plt.figure(figsize=(12, 6))
plt.plot(original_close, label='Original', alpha=0.6)
plt.plot(poly_close, label='Polynomial Interpolation', linestyle='--')
plt.title('Polynomial Interpolation (Order 2)')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.grid(True)
plt.legend()
plt.show()

Spline Interpolation

# Cubic-spline interpolation on month-end data for 2010-2012.
df_filtered = df.loc['2010':'2012']
df_monthly = df_filtered.resample('M').last()

# resample('M') labels rows by month END, so the keys '2010-01-01' and
# '2010-02-01' matched nothing and appended new rows instead of blanking
# existing ones. Month-level keys select the January and February rows.
df_monthly.loc['2010-01', 'Close'] = None
df_monthly.loc['2010-02', 'Close'] = None

# NOTE(review): if these two months are the first rows of the frame, the default
# forward-only fill leaves them NaN; limit_direction='both' lets the spline
# fill them. Confirm this is the intended demonstration.
df_spline = df_monthly.interpolate(method='spline', order=3, limit_direction='both')

plt.figure(figsize=(12, 6))
plt.plot(df_monthly['Close'], label='Original Monthly Data', alpha=0.6, marker='o')
plt.plot(df_spline['Close'], label='Spline Interpolation (Cubic)', linestyle='--')
plt.title('Monthly Data with Spline Interpolation (2010-2012)')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.grid(True)
plt.legend()
plt.show()

Model Selection

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pmdarima as pm  # For auto_arima
from statsmodels.tsa.arima.model import ARIMA  # Corrected import for ARIMA
from statsmodels.tsa.stattools import adfuller  # For stationarity test

# Fit a pure AR(p) model (ARIMA with d=0, q=0) for several p and plot each
# fit's fitted values over the Close series, one panel per order.
series = df['Close']
ar_orders = [1, 4, 6, 10]
fitted_model_dict = {}

plt.figure(figsize=(12, 12))
for panel, ar_order in enumerate(ar_orders, start=1):
    # Use ARIMA instead of ARMA
    candidate = ARIMA(series, order=(ar_order, 0, 0))
    candidate_fit = candidate.fit()
    fitted_model_dict[ar_order] = candidate_fit

    plt.subplot(4, 1, panel)
    plt.plot(series)
    plt.plot(candidate_fit.fittedvalues)
    plt.title('AR(%s) Fit' % ar_order, fontsize=16)

plt.tight_layout()
plt.show()

AIC

# Print the AIC of each fitted AR model, in the order the models were fitted.
for order in ar_orders:
    fit = fitted_model_dict[order]
    print('AIC for AR(%s): %s' % (order, fit.aic))

BIC

# Print the BIC of each fitted AR model, in the order the models were fitted.
for order in ar_orders:
    fit = fitted_model_dict[order]
    print('BIC for AR(%s): %s' % (order, fit.bic))

ADF Statistic

from statsmodels.tsa.stattools import adfuller
from numpy import log

# Augmented Dickey-Fuller test on Close with missing values removed.
close_no_gaps = df['Close'].dropna()
result = adfuller(close_no_gaps)

adf_statistic, p_value = result[0], result[1]
print('ADF Statistic: %f' % adf_statistic)
print('p-value: %f' % p_value)

Order of differencing (d) in ARIMA model

import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf

# Left column: the series at differencing order 0, 1 and 2.
# Right column: the autocorrelation plot of the same series.
close = df['Close']
levels = [
    ('Original Series', close),
    ('1st Order Differencing', close.diff()),
    ('2nd Order Differencing', close.diff().diff()),
]

# sharex='col' instead of sharex=True: the left column's x axis is the series
# index while the right column's is the lag number. Sharing one x axis across
# both columns forces both onto the same limits and squashes one of them.
fig, axes = plt.subplots(3, 2, sharex='col', figsize=(10, 8))
for row, (panel_title, values) in enumerate(levels):
    axes[row, 0].plot(values)
    axes[row, 0].set_title(panel_title)
    plot_acf(values.dropna(), ax=axes[row, 1])

plt.tight_layout()
plt.show()

Order of the AR term (p)

import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_pacf

plt.rcParams.update({'figure.figsize': (9, 3), 'figure.dpi': 120})

diff_close = df['Close'].diff()

# sharex=False: the left panel's x axis is the series index, the right panel's
# is the lag number, so a shared axis would distort one of them.
fig, axes = plt.subplots(1, 2, sharex=False)
axes[0].plot(diff_close)
axes[0].set_title('1st Differencing')

plot_pacf(diff_close.dropna(), ax=axes[1])
# Partial autocorrelations are correlations, so they sit within [-1, 1]; the
# previous ylim=(0, 5) hid every negative spike. Set after plotting so the
# plot call cannot override it.
axes[1].set(ylim=(-1.1, 1.1))
plt.show()

Building the ARIMA Model

from statsmodels.tsa.arima.model import ARIMA

# Fit ARIMA with order (p, d, q) = (1, 1, 2) on Close and print the summary.
arima_order = (1, 1, 2)
model = ARIMA(df['Close'], order=arima_order)
model_fit = model.fit()
print(model_fit.summary())

Rebuilding the model without the MA2 term

# 1,1,1 ARIMA Model
from statsmodels.tsa.arima.model import ARIMA

# Fixes: the heading text was fused into the code; `df.value` is not a column
# of this frame (every other section models df['Close']); and fit(disp=0) is
# rejected by statsmodels.tsa.arima.model.ARIMA, whose fit() has no `disp`.
model = ARIMA(df['Close'], order=(1, 1, 1))
model_fit = model.fit()
print(model_fit.summary())

Plot the Residuals

# Refit ARIMA(1, 1, 2) and plot its residuals as a line and as a density estimate.
model = ARIMA(df['Close'], order=(1, 1, 2))
model_fit = model.fit()
residuals = pd.DataFrame(model_fit.resid)

fig, ax = plt.subplots(1, 2, figsize=(12, 5))
line_axis, density_axis = ax
residuals.plot(title="Residuals", ax=line_axis)
residuals.plot(kind='kde', title='Density', ax=density_axis)
plt.show()

Optimal ARIMA model manually using Out-of-Time Cross validation

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA

# Out-of-time validation: fit on the first 85 rows, forecast the next 15, and
# compare with the rows that actually followed.
n_train = 85
n_steps = 15
train = df['Close'].iloc[:n_train]
# Previously `test` was every row after 85 while only 15 steps were forecast,
# so forecast and actuals covered different horizons.
test = df['Close'].iloc[n_train:n_train + n_steps]

model = ARIMA(train, order=(3, 2, 1))
fitted = model.fit()
print(fitted.summary())

forecast = fitted.get_forecast(steps=len(test))
fc = forecast.predicted_mean
conf = forecast.conf_int(alpha=0.05)  # 95% interval

# Pass raw values: pd.Series(<Series>, index=...) re-aligns on labels, and the
# forecast's own index need not match test.index, which would yield all-NaN
# series and an empty forecast line.
fc_values = np.asarray(fc)
conf_values = np.asarray(conf)
fc_series = pd.Series(fc_values, index=test.index)
lower_series = pd.Series(conf_values[:, 0], index=test.index)
upper_series = pd.Series(conf_values[:, 1], index=test.index)

plt.figure(figsize=(12, 5), dpi=100)
plt.plot(train, label='Training', color='blue')
plt.plot(test, label='Actual', color='orange')
plt.plot(fc_series, label='Forecast', color='red')
plt.fill_between(lower_series.index, lower_series, upper_series, color='gray', alpha=0.3)
plt.title('Forecast vs Actuals')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend(loc='upper left', fontsize=8)
plt.grid()
plt.show()

Accuracy Metrics for Time Series Forecast

import numpy as np
from statsmodels.tsa.stattools import acf

# Actual values for the forecast horizon: the rows right after the 85-row
# training window used to produce `fc`. The previous [-15:] slice only lines up
# with the forecast when the frame has exactly 100 rows.
test = df['Close'].values[85:85 + len(fc)]


def forecast_accuracy(forecast, actual):
    """Return a dict of error metrics comparing *forecast* with *actual*.

    Both arguments are converted to 1-D float arrays, so pandas Series and
    NumPy arrays are accepted; they must have the same length.

    Keys of the returned dict:
        mape   - mean absolute percentage error (as a fraction)
        me     - mean error
        mae    - mean absolute error
        mpe    - mean percentage error (as a fraction)
        rmse   - root mean squared error
        acf1   - lag-1 autocorrelation of the errors
        corr   - correlation between forecast and actual
        minmax - 1 minus the mean of elementwise min/max
    """
    forecast = np.asarray(forecast, dtype=float)
    actual = np.asarray(actual, dtype=float)
    error = forecast - actual

    mape = np.mean(np.abs(error) / np.abs(actual))
    me = np.mean(error)
    mae = np.mean(np.abs(error))
    mpe = np.mean(error / actual)
    # The ** operators were lost in the original ("... ) 2) 0.5").
    rmse = np.mean(error ** 2) ** 0.5
    corr = np.corrcoef(forecast, actual)[0, 1]
    # Elementwise min/max; same result as stacking the two columns and reducing
    # along axis 1, without needing 2-D indexing on the inputs.
    mins = np.minimum(forecast, actual)
    maxs = np.maximum(forecast, actual)
    minmax = 1 - np.mean(mins / maxs)
    acf1 = acf(error)[1]

    return {
        'mape': mape,
        'me': me,
        'mae': mae,
        'mpe': mpe,
        'rmse': rmse,
        'acf1': acf1,
        'corr': corr,
        'minmax': minmax,
    }


accuracy_metrics = forecast_accuracy(fc, test)
print(accuracy_metrics)

Auto Arima Forecast in Python

import pmdarima as pm

# Stepwise, non-seasonal auto-ARIMA search over p, q in [1, 5]; with d=None the
# differencing order is chosen using the ADF test.
# dropna(): the linear-interpolation section blanks one Close value in this
# frame, and the other stationarity tests on this column also drop missing
# values before fitting.
model_arima = pm.auto_arima(
    df['Close'].dropna(),
    trace=True,
    error_action='ignore',
    test='adf',
    start_p=1,
    start_q=1,
    max_p=5,
    max_q=5,
    d=None,
    seasonal=False,
    suppress_warnings=True,
    stepwise=True,
)
print(model_arima.summary())

Interpreting the residual plots in ARIMA model

# Residual diagnostics of the auto-ARIMA fit. The original called this on
# `model`, which at this point is the unfitted statsmodels ARIMA from the
# out-of-time validation section; the fitted auto-ARIMA object is `model_arima`.
model_arima.plot_diagnostics(figsize=(10, 15))
plt.show()

Final Forecast

# Forecast the next 24 periods with the auto-ARIMA model and plot them, with
# their confidence band, after the observed series.
n_periods = 24
# `model_arima` is the fitted auto-ARIMA object; `model` is not.
fc, confint = model_arima.predict(n_periods=n_periods, return_conf_int=True)
# Use raw values so pd.Series(...) below assigns the new index instead of
# re-aligning on whatever index the prediction came back with.
fc = np.asarray(fc)
confint = np.asarray(confint)

if isinstance(df.index, pd.DatetimeIndex):
    # Continue on the date axis so the forecast sits right after the history.
    # NOTE(review): assumes business-day data — confirm for this dataset.
    index_of_fc = pd.bdate_range(
        start=df.index[-1] + pd.Timedelta(days=1), periods=n_periods
    )
else:
    index_of_fc = np.arange(len(df['Close']), len(df['Close']) + n_periods)

fc_series = pd.Series(fc, index=index_of_fc)
lower_series = pd.Series(confint[:, 0], index=index_of_fc)
upper_series = pd.Series(confint[:, 1], index=index_of_fc)

plt.plot(df['Close'])
plt.plot(fc_series, color='darkgreen')
plt.fill_between(lower_series.index, lower_series, upper_series, color='k', alpha=.15)
plt.title("Final Forecast of Stock Prices")
plt.show()

Stationarity

import pandas as pd
import matplotlib.pyplot as plt

# Earlier sections move Date into the index; bring it back as a column so the
# df['Date'] accesses below do not raise KeyError.
if 'Date' not in df.columns:
    df = df.reset_index()
df['Date'] = pd.to_datetime(df['Date'])


def plot_df(df, x, y, title="", xlabel='Date', ylabel='Value', dpi=100):
    """Draw a single red line plot of *y* against *x* and show it.

    Args:
        df: Accepted for call compatibility; not read by this function.
        x: Values for the horizontal axis.
        y: Values for the vertical axis.
        title: Plot title.
        xlabel: Label for the horizontal axis.
        ylabel: Label for the vertical axis.
        dpi: Figure resolution passed to ``plt.figure``.
    """
    plt.figure(figsize=(16, 5), dpi=dpi)
    plt.plot(x, y, color='tab:red')
    plt.gca().set(title=title, xlabel=xlabel, ylabel=ylabel)
    plt.show()


plot_df(df, x=df['Date'], y=df['Close'], title='Netflix Stock Closing Prices from 2002 to 2024')

Boxplot of Month-wise (Seasonal) and Year-wise (trend) Distribution

# Year-wise and month-wise box plots of Close, side by side.
df['Date'] = pd.to_datetime(df['Date'])
date_parts = df['Date'].dt
df['year'] = date_parts.year
df['month'] = date_parts.strftime('%b')
years = df['year'].unique()

fig, axes = plt.subplots(1, 2, figsize=(20, 7), dpi=80)
year_axis, month_axis = axes

sns.boxplot(x='year', y='Close', data=df, ax=year_axis)
year_axis.set_title('Year-wise Box Plot\n(The Trend)', fontsize=18)

# NOTE(review): 1991 and 2008 are excluded from the month-wise plot; these
# years look carried over from another dataset — confirm they are intended here.
keep_rows = ~df['year'].isin([1991, 2008])
sns.boxplot(x='month', y='Close', data=df.loc[keep_rows, :], ax=month_axis)
month_axis.set_title('Month-wise Box Plot\n(The Seasonality)', fontsize=18)

plt.show()

Testing Stationarity - ADF test and KPSS test

from statsmodels.tsa.stattools import adfuller, kpss

# Drop missing values first: the interpolation section blanks one Close value
# in this frame, and the earlier ADF section also runs on the dropna() series.
close_values = df['Close'].dropna().values

# ADF test
result = adfuller(close_values, autolag='AIC')
print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')
# The header is printed once; it used to sit inside the loop and was repeated
# before every critical value.
print('Critical Values:')
for key, value in result[4].items():
    print(f' {key}, {value}')

# KPSS test with a constant-only regression ('c')
result = kpss(close_values, regression='c')
print('\nKPSS Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[3].items():
    print(f' {key}, {value}')