from numpy import array
from numpy import mean, var, std, cov, corrcoef
from numpy.linalg import eig, inv, pinv, qr, lstsq
from sklearn.decomposition import PCA
from matplotlib import pyplot
Statistics
Introduction to Multivariate Statistics
Expected Value and Mean
v = array([1,2,3,4,5,6])
print(v)
[1 2 3 4 5 6]
result = mean(v)
print(result)
3.5
M = array([
    [1,2,3,4,5,6],
    [1,2,3,4,5,6]])
print(M)
[[1 2 3 4 5 6]
[1 2 3 4 5 6]]
col_mean = mean(M, axis=0)
print(col_mean)
[1. 2. 3. 4. 5. 6.]
row_mean = mean(M, axis=1)
print(row_mean)
[3.5 3.5]
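As a check on the axis convention, the same means fall out of an explicit sum divided by a count; a minimal sketch using numpy's sum():

# Reproduce mean() with an explicit sum divided by a count.
from numpy import array
v = array([1,2,3,4,5,6])
print(v.sum() / len(v))  # 3.5, matching mean(v)
M = array([
    [1,2,3,4,5,6],
    [1,2,3,4,5,6]])
print(M.sum(axis=0) / M.shape[0])  # column means, as with axis=0
print(M.sum(axis=1) / M.shape[1])  # row means, as with axis=1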
Variance and Standard Deviation
v = array([1,2,3,4,5,6])
print(v)
[1 2 3 4 5 6]
result = var(v, ddof=1)
print(result)
3.5
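The ddof=1 argument requests the sample variance, which divides the sum of squared deviations by n - 1 rather than n; a minimal sketch of the same calculation by hand:

# Sample variance by hand, showing what ddof=1 changes.
from numpy import array, mean
v = array([1,2,3,4,5,6])
sq_dev = (v - mean(v)) ** 2
print(sq_dev.sum() / (len(v) - 1))  # 3.5, matching var(v, ddof=1)
print(sq_dev.sum() / len(v))        # ~2.9167, the ddof=0 population variance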
M = array([
    [1,2,3,4,5,6],
    [1,2,3,4,5,6]])
print(M)
[[1 2 3 4 5 6]
[1 2 3 4 5 6]]
col_var = var(M, ddof=1, axis=0)
print(col_var)
[0. 0. 0. 0. 0. 0.]
row_var = var(M, ddof=1, axis=1)
print(row_var)
[3.5 3.5]
col_std = std(M, ddof=1, axis=0)
print(col_std)
[0. 0. 0. 0. 0. 0.]
row_std = std(M, ddof=1, axis=1)
print(row_std)
[1.87082869 1.87082869]
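The standard deviation is simply the square root of the variance under the same ddof and axis settings, which a one-line check with numpy's sqrt() confirms:

# std() is the square root of var() under the same ddof and axis.
from numpy import array, var, sqrt
M = array([
    [1,2,3,4,5,6],
    [1,2,3,4,5,6]])
print(sqrt(var(M, ddof=1, axis=1)))  # [1.87082869 1.87082869], matching std()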
Covariance and Correlation
x = array([1,2,3,4,5,6,7,8,9])
print(x)
[1 2 3 4 5 6 7 8 9]
y = array([9,8,7,6,5,4,3,2,1])
print(y)
[9 8 7 6 5 4 3 2 1]
Sigma = cov(x,y)[0,1]
print(Sigma)
-7.5
corr = corrcoef(x,y)[0,1]
print(corr)
-1.0
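Both results follow from the definitions: covariance averages the products of paired deviations (again with an n - 1 divisor), and correlation rescales the covariance by the two standard deviations. A minimal sketch:

# Covariance and Pearson correlation from their definitions.
from numpy import array, mean, std
x = array([1,2,3,4,5,6,7,8,9])
y = array([9,8,7,6,5,4,3,2,1])
cov_xy = ((x - mean(x)) * (y - mean(y))).sum() / (len(x) - 1)
print(cov_xy)  # -7.5, matching cov(x,y)[0,1]
print(cov_xy / (std(x, ddof=1) * std(y, ddof=1)))  # -1.0, a perfect negative correlation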
Covariance Matrix
X = array([
    [1, 5, 8],
    [3, 5, 11],
    [2, 4, 9],
    [3, 6, 10],
    [1, 5, 10]])
print(X)
[[ 1 5 8]
[ 3 5 11]
[ 2 4 9]
[ 3 6 10]
[ 1 5 10]]
Sigma = cov(X.T)
print(Sigma)
[[1. 0.25 0.75]
[0.25 0.5 0.25]
[0.75 0.25 1.3 ]]
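cov(X.T) treats each column of X as a variable; the same matrix falls out of centering the columns and forming the product of the centered data with its own transpose, divided by n - 1. A minimal sketch:

# The covariance matrix as a product of the centered data with itself.
from numpy import array, mean
X = array([
    [1, 5, 8],
    [3, 5, 11],
    [2, 4, 9],
    [3, 6, 10],
    [1, 5, 10]])
C = X - mean(X, axis=0)           # center each column
print(C.T.dot(C) / (len(X) - 1))  # matches cov(X.T)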
Principal Component Analysis
Calculate Principal Component Analysis
A = array([
    [1,2],
    [3,4],
    [5,6]])
print(A)
[[1 2]
[3 4]
[5 6]]
M = mean(A.T, axis=1)     # column means of A
C = A - M                 # center the columns
V = cov(C.T)              # covariance matrix of the centered data
values, vectors = eig(V)  # eigendecomposition: variances and components
print(vectors)
[[ 0.70710678 -0.70710678]
[ 0.70710678 0.70710678]]
print(values)
[8. 0.]
P = vectors.T.dot(C.T)
print(P.T)
[[-2.82842712 0. ]
[ 0. 0. ]
[ 2.82842712 0. ]]
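The eigenvalues are the variances of the data along the corresponding components, which a quick check on the projection confirms:

# Each eigenvalue equals the variance of the projected data along its component.
from numpy import array, mean, var, cov
from numpy.linalg import eig
A = array([[1,2], [3,4], [5,6]])
C = A - mean(A.T, axis=1)
values, vectors = eig(cov(C.T))
P = vectors.T.dot(C.T)
print(var(P, ddof=1, axis=1))  # [8. 0.], matching the eigenvalues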
Principal Component Analysis in scikit-learn
A = array([
    [1,2],
    [3,4],
    [5,6]])
print(A)
[[1 2]
[3 4]
[5 6]]
pca = PCA(2)
pca.fit(A)
PCA(n_components=2)
print(pca.components_)
[[ 0.70710678 0.70710678]
[-0.70710678 0.70710678]]
print(pca.explained_variance_)
[8. 0.]
B = pca.transform(A)
print(B)
[[-2.82842712e+00 -2.22044605e-16]
[ 0.00000000e+00 0.00000000e+00]
[ 2.82842712e+00 2.22044605e-16]]
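When all components are kept the projection loses no information, and pca.inverse_transform() maps the projected data back to the original space; a quick round-trip check:

# Round trip: transform() projects, inverse_transform() maps back.
from numpy import array
from sklearn.decomposition import PCA
A = array([[1,2], [3,4], [5,6]])
pca = PCA(2).fit(A)
print(pca.inverse_transform(pca.transform(A)))  # recovers A up to floating-point noise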
Linear Regression
data = array([
    [0.05, 0.12],
    [0.18, 0.22],
    [0.31, 0.35],
    [0.42, 0.38],
    [0.5, 0.49]])
print(data)
[[0.05 0.12]
[0.18 0.22]
[0.31 0.35]
[0.42 0.38]
[0.5 0.49]]
X, y = data[:, 0], data[:, 1]
X = X.reshape(len(X), 1)
Linear Regression Dataset
pyplot.scatter(X, y)
pyplot.show()
Solve via Inverse
b = inv(X.T.dot(X)).dot(X.T).dot(y)
print(b)
[1.00233226]
yhat = X.dot(b)
pyplot.scatter(X, y)
pyplot.plot(X, yhat, color='red')
pyplot.show()
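This solve applies the normal equations directly, and because X has no column of ones the fitted line passes through the origin. One way to summarize the fit is the mean squared error of the predictions; a minimal sketch (the error metric is an addition to the example):

# Mean squared error of the normal-equations fit (no intercept term).
from numpy import array, mean
from numpy.linalg import inv
data = array([
    [0.05, 0.12],
    [0.18, 0.22],
    [0.31, 0.35],
    [0.42, 0.38],
    [0.5, 0.49]])
X, y = data[:, 0].reshape(-1, 1), data[:, 1]
b = inv(X.T.dot(X)).dot(X.T).dot(y)
print(mean((y - X.dot(b)) ** 2))  # average squared residual of the fit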
Solve via QR Decomposition
Q, R = qr(X)
b = inv(R).dot(Q.T).dot(y)
print(b)
[1.00233226]
yhat = X.dot(b)
pyplot.scatter(X, y)
pyplot.plot(X, yhat, color='red')
pyplot.show()
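numpy's qr() returns the reduced factorization by default, so Q has orthonormal columns and R is square and invertible; a quick check that the factors behave as expected:

# Sanity checks on the reduced QR factorization.
from numpy import array, allclose, eye
from numpy.linalg import qr
X = array([[0.05], [0.18], [0.31], [0.42], [0.5]])
Q, R = qr(X)                          # reduced mode: Q is 5x1, R is 1x1
print(allclose(Q.T.dot(Q), eye(1)))   # True: orthonormal columns
print(allclose(Q.dot(R), X))          # True: the factors reproduce X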
Solve via SVD and Pseudoinverse
b = pinv(X).dot(y)
print(b)
[1.00233226]
yhat = X.dot(b)
pyplot.scatter(X, y)
pyplot.plot(X, yhat, color='red')
pyplot.show()
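pinv() is built on the singular value decomposition; the same coefficient falls out of assembling the pseudoinverse from numpy.linalg.svd's factors by hand, a minimal sketch:

# The pseudoinverse assembled explicitly from the SVD factors.
from numpy import array, diag
from numpy.linalg import svd
X = array([[0.05], [0.18], [0.31], [0.42], [0.5]])
y = array([0.12, 0.22, 0.35, 0.38, 0.49])
U, s, Vt = svd(X, full_matrices=False)
print(Vt.T.dot(diag(1.0 / s)).dot(U.T).dot(y))  # [1.00233226], matching pinv(X).dot(y)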
Solve via Convenience Function
b, residuals, rank, s = lstsq(X, y, rcond=None)
print(b)
[1.00233226]
yhat = X.dot(b)
pyplot.scatter(X, y)
pyplot.plot(X, yhat, color='red')
pyplot.show()
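All four solvers agree on this problem; lstsq() additionally reports the residual sum of squares, the rank of X, and X's singular values, which a short comparison makes visible:

# The four approaches agree; lstsq() also returns diagnostic values.
from numpy import array, allclose
from numpy.linalg import inv, pinv, lstsq
X = array([[0.05], [0.18], [0.31], [0.42], [0.5]])
y = array([0.12, 0.22, 0.35, 0.38, 0.49])
b_inv = inv(X.T.dot(X)).dot(X.T).dot(y)
b_pinv = pinv(X).dot(y)
b_lstsq, residuals, rank, s = lstsq(X, y, rcond=None)
print(allclose(b_inv, b_pinv) and allclose(b_pinv, b_lstsq))  # True
print(residuals, rank, s)  # residual sum of squares, rank, singular values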