Six Charts for Analyzing Time Series Data

This article analyzes time series data in Python using box plots, the Fourier transform, entropy, autocorrelation, and PCA.

Python Libraries

First, these are the libraries used throughout the notebook. Most of the code revolves around NumPy and pandas, since the data is mostly NumPy arrays wrapped in pandas DataFrames.

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib.dates import date2num

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, chi2

from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf

import warnings
warnings.filterwarnings("ignore")

Importing the Data

After downloading the data, run the following code to import it. The timestamp column arrives as plain strings, so it is converted to a proper datetime type before filtering down to one week of data.

df_orig = pd.read_csv('data/data.csv',
                      usecols=['datetime', 'machine_status',
                               'sensor_00', 'sensor_10', 'sensor_20',
                               'sensor_30', 'sensor_40', 'sensor_50'])

# 'datetime' is read in as a string; convert it to a real datetime type
print(type(df_orig['sensor_00'].iloc[0]),
      type(df_orig['datetime'].iloc[0]))
df_orig['datetime'] = pd.to_datetime(df_orig['datetime'])
print(type(df_orig['sensor_00'].iloc[0]),
      type(df_orig['datetime'].iloc[0]))

# Keep one week of data
cond_1 = df_orig['datetime'] >= '2018-04-12 00:00:00'
cond_2 = df_orig['datetime'] <= '2018-04-19 00:00:00'
df_orig = df_orig[cond_1 & cond_2]
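If preferred, the string-to-datetime conversion can also happen at load time; a minimal equivalent sketch, assuming the same file layout:

# One-step load: let pandas parse the timestamp column directly
df_orig = pd.read_csv('data/data.csv', parse_dates=['datetime'],
                      usecols=['datetime', 'machine_status',
                               'sensor_00', 'sensor_10', 'sensor_20',
                               'sensor_30', 'sensor_40', 'sensor_50'])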

Data Preprocessing

Before visualizing, the data was checked for duplicate and missing values. Below are a function that drops duplicate rows and a function that fills in missing timestamps:

def drop_duplicates(df: pd.DataFrame, subset: list = ['DATE_TIME']) -> pd.DataFrame:
    df = df.drop_duplicates(subset=subset)
    return df

def fill_missing_date(df: pd.DataFrame, column_datetime: str = 'DATE_TIME'):
    print(f'Input shape: {df.shape}')

    data_s = df.drop([column_datetime], axis=1)
    datetime_s = df[column_datetime].astype(str)

    start_date = min(df[column_datetime])
    end_date = max(df[column_datetime])
    date_s = pd.date_range(start_date, end_date, freq="min").strftime('%Y-%m-%d %H:%M:%S')

    data_processed_s = []
    for date_val in date_s:
        pos = np.where(date_val == datetime_s)[0]
        assert len(pos) in [0, 1]
        if len(pos) == 0:
            # Timestamp missing from the data: fill every column with 0
            data = [date_val] + [0] * data_s.shape[1]
        elif len(pos) == 1:
            data = [date_val] + data_s.iloc[pos].values.tolist()[0]
        data_processed_s.append(data)

    df_processed = pd.DataFrame(data_processed_s,
                                columns=[column_datetime] + data_s.columns.values.tolist())
    df_processed[column_datetime] = pd.to_datetime(df_processed[column_datetime])
    print(f'Output shape: {df_processed.shape}')

    return df_processed

df_processed = drop_duplicates(df_orig, subset=['datetime'])
df = fill_missing_date(df_processed, column_datetime='datetime')

df_data = df.drop(columns=['machine_status'])
df_labels = df[['datetime', 'machine_status']].copy()
# Binary label: 1 where the machine is BROKEN, 0 otherwise
df_labels.loc[df_labels['machine_status'] != 'BROKEN', 'machine_status'] = 0
df_labels.loc[df_labels['machine_status'] == 'BROKEN', 'machine_status'] = 1
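For reference, the same gap filling can be written more compactly with pandas' own reindexing; this is a sketch of an equivalent approach, assuming 'datetime' is unique after dropping duplicates:

# Alternative sketch: reindex onto a complete minute-level range,
# filling the columns of missing timestamps with 0 (as the loop above does)
full_range = pd.date_range(df_processed['datetime'].min(),
                           df_processed['datetime'].max(), freq='min')
df_alt = (df_processed.set_index('datetime')
          .reindex(full_range, fill_value=0)
          .rename_axis('datetime')
          .reset_index())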

Data Visualization

Now we are ready to start visualizing. First, a plot of each sensor's hourly-averaged data with the anomalies overlaid in red:

df_data_hour = df_data.groupby(
    pd.Grouper(key='datetime', freq='H')).mean()
df_labels_hour = df_labels.groupby(
    pd.Grouper(key='datetime', freq='H')).sum()

for name in df.columns:
    if name not in ['datetime', 'machine_status']:
        fig, axs = plt.subplots(1, 1, figsize=(15, 2))
        axs.plot(df_data_hour[name], color='blue')
        # Overlay the failure label on a secondary axis
        axs_twinx = axs.twinx()
        axs_twinx.plot(df_labels_hour['machine_status'], color='red')
        axs.set_title(name)
        plt.show()
Next, overlay each sensor with a daily mean and standard deviation, computed by resampling the hourly series:

df_rollmean = df_data_hour.resample(rule='D').mean()
df_rollstd = df_data_hour.resample(rule='D').std()

for name in df.columns:
    if name not in ['datetime', 'machine_status']:
        fig, axs = plt.subplots(1, 1, figsize=(15, 2))
        axs.plot(df_data_hour[name], color='blue', label='Original')
        axs.plot(df_rollmean[name], color='red', label='Rolling Mean')
        axs.plot(df_rollstd[name], color='black', label='Rolling Std')
        axs.set_title(name)
        axs.legend()
        plt.show()
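Note that `df_rollmean` above is a daily resample rather than a true sliding window. If a genuinely rolling statistic is preferred, pandas' `rolling` computes one directly; a minimal sketch with an assumed 24-hour window:

# Sliding 24-hour window over the hourly series (window length is an assumption)
df_rollmean_sliding = df_data_hour.rolling(window=24).mean()
df_rollstd_sliding = df_data_hour.rolling(window=24).std()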
A per-day box plot shows how each sensor's distribution shifts from day to day:

df_boxplot = df_data.copy()
df_boxplot['date'] = df_boxplot['datetime'].dt.strftime('%Y-%m-%d')

for name in df_boxplot.columns:
    if name not in ['datetime', 'date']:
        fig, axs = plt.subplots(1, 1, figsize=(15, 2))
        sns.boxplot(y=name, x='date', data=df_boxplot, ax=axs)
        axs.set_ylabel('Value')
        axs.set_title(name)
        plt.show()
Sliding-Window FFT

The next chart computes the FFT over half-overlapping, Hamming-windowed slices of each signal and displays the result as a spectrogram beneath the raw data:

def fft(data, nwindow=64, freq=32):
    ffts = []
    for i in range(0, len(data) - nwindow, nwindow // 2):
        sliced = data[i:i + nwindow]
        # Magnitudes of the first `freq` bins of the windowed slice
        fft = np.abs(np.fft.rfft(sliced * np.hamming(nwindow))[:freq])
        ffts.append(fft.tolist())
    ffts = np.array(ffts)
    return ffts

def data_plot(date_time, data, labels, ax):
    ax.plot(date_time, data)
    ax.set_xlim(date2num(np.min(date_time)), date2num(np.max(date_time)))
    # Failure label on a secondary axis
    axs_twinx = ax.twinx()
    axs_twinx.plot(date_time, labels, color='red')
    axs_twinx.set_ylabel('Label')

def fft_plot(ffts, ax):
    # Time on the x-axis, frequency on the y-axis, log-scaled color
    ax.imshow(np.flipud(np.rot90(ffts)), aspect='auto', cmap=matplotlib.cm.bwr,
              norm=LogNorm(vmin=np.min(ffts), vmax=np.max(ffts)))
    ax.set_xlabel('Timestamp')
    ax.set_ylabel('Freq')

df_fourier = df_data.copy()
for name in df_fourier.columns:
    if name != 'datetime':
        fig, axs = plt.subplots(2, 1, figsize=(15, 6))
        data = df_fourier[name].to_numpy()
        ffts = fft(data, nwindow=64, freq=32)
        data_plot(df_fourier['datetime'], data, df_labels['machine_status'], axs[0])
        fft_plot(ffts, axs[1])
        axs[0].set_title(name)
        plt.show()
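To sanity-check the sliding-window FFT, feed it a pure sine wave and confirm the energy lands in the expected frequency bin; a small sketch with made-up sample values:

# A sine at 4 cycles per 64-sample window should peak in bin 4
t = np.arange(4096)
test_signal = np.sin(2 * np.pi * 4 * t / 64)
test_ffts = fft(test_signal, nwindow=64, freq=32)
print(test_ffts.mean(axis=0).argmax())  # expected: 4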
Spectral entropy condenses each FFT window into a single number: it is low when the spectral energy concentrates in a few frequencies and high when the energy spreads across many.

def entropy(data, nwindow=64):
    entropy_s = []
    for i in range(0, len(data) - nwindow, nwindow // 2):
        sliced = data[i:i + nwindow]
        fft = np.abs(np.fft.rfft(sliced * np.hamming(nwindow))[:nwindow // 2])
        # Normalize the spectrum into a probability distribution
        p = fft / np.sum(fft)
        entropy = -np.sum(p * np.log(p))
        entropy_s.append(entropy)
    entropy_s = np.array(entropy_s)
    return entropy_s

def data_plot(date_time, data, labels, ax):
    ax.plot(date_time, data)
    axs_twinx = ax.twinx()
    axs_twinx.plot(date_time, labels, color='red')
    ax.set_ylabel('Value')
    axs_twinx.set_ylabel('Label')

def entropy_plot(data, ax):
    ax.plot(data, c='k')
    ax.set_xlabel('Timestamp')
    ax.set_ylabel('Entropy')

df_entropy = df_data.copy()
for name in df_entropy.columns:
    if name != 'datetime':
        fig, axs = plt.subplots(2, 1, figsize=(15, 6))
        data = df_entropy[name].to_numpy()
        entropy_s = entropy(data, nwindow=64)
        data_plot(df_entropy['datetime'], data, df_labels['machine_status'], axs[0])
        entropy_plot(entropy_s, axs[1])
        axs[0].set_title(name)
        plt.show()
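The intuition behind spectral entropy: a single dominant tone gives a concentrated spectrum and low entropy, while noise gives a flat spectrum and high entropy. A quick sketch on synthetic signals (the tiny noise term only avoids exact zeros in the spectrum):

# Low entropy: one dominant tone; high entropy: white noise
tone = np.sin(2 * np.pi * 4 * np.arange(4096) / 64) + 1e-9 * np.random.randn(4096)
noise = np.random.randn(4096)
print(entropy(tone, nwindow=64).mean())   # noticeably lower
print(entropy(noise, nwindow=64).mean())  # approaches log(32) ≈ 3.47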
The last technique is PCA. First, standardize the sensors and fit a full PCA to see how much variance each component explains:

x = df_data.drop(columns=['datetime'])
scaler = StandardScaler()
pca = PCA()
pipeline = make_pipeline(scaler, pca)
pipeline.fit(x)

features = range(pca.n_components_)
plt.figure(figsize=(22, 5))
plt.bar(features, pca.explained_variance_ratio_)
plt.xlabel('PCA feature')
plt.ylabel('Variance')
plt.xticks(features)
plt.title("Importance of the Principal Components based on inertia")
plt.show()
Then keep the first two components and attach them to the data:

pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data=principalComponents, columns=['pc1', 'pc2'])

df_pca = df_data.copy()
df_pca['pca1'] = pd.Series(principalDf['pc1'].values, index=df.index)
df_pca['pca2'] = pd.Series(principalDf['pc2'].values, index=df.index)
print(df_pca.shape)
print(df_pca.head())
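One caveat: the pipeline above standardizes the features before PCA, while this two-component fit runs on the raw values, so the two results are not strictly comparable. A sketch of the scaled variant, if consistency is wanted:

# Variant sketch: standardize first so the 2-component fit matches the pipeline
x_scaled = StandardScaler().fit_transform(x)
principalComponents_scaled = PCA(n_components=2).fit_transform(x_scaled)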
Plot the hourly-averaged components against the failures:

df_pca_hour = df_pca.groupby(pd.Grouper(key='datetime', freq='H')).mean()

for name in df_pca.columns:
    if name in ['pca1', 'pca2']:
        fig, axs = plt.subplots(1, 1, figsize=(15, 2))
        axs.plot(df_pca_hour[name], color='blue')
        axs_twinx = axs.twinx()
        axs_twinx.plot(df_labels_hour['machine_status'], color='red')
        axs.set_title(name)
        plt.show()
Next, the autocorrelation of the first principal component's percent change, together with its ACF plot:

pca1 = principalDf['pc1'].pct_change()
autocorrelation = pca1.dropna().autocorr()
print('Autocorrelation is: ', autocorrelation)

plot_acf(pca1.dropna(), lags=20, alpha=0.05)
plt.show()

Autocorrelation is:  0.024363541344977133
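`Series.autocorr()` is simply the Pearson correlation between the series and a lagged copy of itself, so the number above can be reproduced by hand:

# Equivalent computation: correlate the series with its lag-1 shift
pca1_clean = pca1.dropna()
print(pca1_clean.corr(pca1_clean.shift(1)))  # matches autocorr() above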
Finally, an augmented Dickey-Fuller test checks each series for stationarity:

for name in df_pca.columns:
    if name != 'datetime':
        print(f'{name}: ', adfuller(df_pca[name]))
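`adfuller` returns a tuple whose first two entries are the test statistic and the p-value; a small sketch that unpacks them for the two components and prints a verdict (5% significance level assumed):

# Unpack the ADF result: a small p-value rejects the unit-root hypothesis
for name in ['pca1', 'pca2']:
    stat, pvalue = adfuller(df_pca[name])[:2]
    verdict = 'stationary' if pvalue < 0.05 else 'non-stationary'
    print(f'{name}: statistic={stat:.3f}, p-value={pvalue:.3g} -> {verdict}')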
