7. SVD (Singular Value Decomposition; 特異値分解)

https://ohke.hateblo.jp/entry/2017/12/14/230500

7.1. SVDとは

7.2. scikit-learnを使った実験

import numpy as np
import pandas as pd
import plotly.express as px 
from sklearn.datasets import load_iris
#from sklearn.decomposition import TruncatedSVD
from scipy.linalg import svd
from sklearn.exceptions import NotFittedError
# Load the iris data as one DataFrame (the "frame" entry of the returned
# bunch) and display its first rows.
df = load_iris(as_frame=True).frame
df.head()
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0

7.3. NumPyを使って実装する

class MySVD():
    r"""Thin SVD computed from the eigendecomposition of X^T X."""

    def fit_transform(self, X, y=None):
        r"""Factor ``X`` as ``U @ diag(s) @ Vt`` (X = U \Sigma V^T).

        With ``X`` of shape (D, F), the returned factors are the *thin*
        decomposition:

        * ``U``  -- (D, F), left singular vectors as columns,
        * ``s``  -- (F,),   singular values in descending order,
        * ``Vt`` -- (F, F), right singular vectors as rows.

        Parameters
        ----------
        X : array-like of shape (D, F)
            Real-valued data matrix (a NumPy array or a pandas DataFrame).
        y : ignored
            Accepted but never used.

        Returns
        -------
        tuple of (U, s, Vt) as described above.

        Raises
        ------
        numpy.linalg.LinAlgError
            If ``X`` is not two-dimensional.

        Notes
        -----
        Columns of ``U`` that belong to a zero singular value are left as
        zero vectors: ``u_i = X v_i / s_i`` is undefined there, and the
        product ``U @ diag(s) @ Vt`` is unaffected by that choice.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise np.linalg.LinAlgError(
                f"X must be 2-dimensional, got {X.ndim} dimension(s)"
            )

        # X^T X is symmetric, so eigh is the right solver: it returns real
        # eigenvalues and orthonormal eigenvectors, which eig does not
        # guarantee.
        gram_eigen_values, gram_eigen_vectors = np.linalg.eigh(X.T @ X)  # (F,), (F, F)
        order = np.argsort(gram_eigen_values)[::-1]  # descending

        # Round-off can push a zero eigenvalue slightly below zero; clip it so
        # the square root cannot produce NaN.
        singular_values = np.sqrt(np.clip(gram_eigen_values[order], 0.0, None))  # (F,)

        # Right singular vectors, reordered to match the singular values.
        V = gram_eigen_vectors[:, order]  # (F, F)

        # Left singular vectors: u_i = X v_i / s_i, computed for all columns at
        # once. Columns with s_i == 0 are skipped and stay zero.
        U = np.zeros((X.shape[0], singular_values.size))  # (D, F)
        np.divide(X @ V, singular_values, out=U, where=singular_values > 0)

        return U, singular_values, V.T
    
# Feature matrix: every column of df except the label column.
X = df.drop(columns="target")

# Decompose the same data with the hand-written class and with SciPy.
myoutput = MySVD().fit_transform(X)
output = svd(X)

# Print the shapes of the three factors from each implementation.
print(*(factor.shape for factor in myoutput))
print(*(factor.shape for factor in output))
(150, 4) (4,) (4, 4)
(150, 150) (4,) (4, 4)
# Multiply X by column 1 of the transposed third MySVD output
# (equivalently, row 1 of myoutput[2]), then display the result's shape.
a = X@myoutput[2].T[:,1]
a.shape
(150,)
# Display the shape of the feature frame X.
X.shape
(150, 4)
# Display the third element of the scipy.linalg.svd result.
output[2]
array([[-0.75110816, -0.38008617, -0.51300886, -0.16790754],
       [ 0.2841749 ,  0.5467445 , -0.70866455, -0.34367081],
       [ 0.50215472, -0.67524332, -0.05916621, -0.53701625],
       [ 0.32081425, -0.31725607, -0.48074507,  0.75187165]])
# Display the first two entries of the second element of the scipy.linalg.svd result.
output[1][:2]
array([95.95991387, 17.76103366])