import math
import random
import matplotlib.pyplot as plt
from numpy import *

n_components = 2

def f1(x, period=4):
    # sawtooth source signal
    return 0.5 * (x - math.floor(x / period) * period)

def create_data():
    # number of samples
    n = 500
    # sample times
    T = [0.1 * xi for xi in range(0, n)]
    # source signals: a sine wave and a sawtooth wave
    S = array([[sin(xi) for xi in T], [f1(xi) for xi in T]], float32)
    # mixing matrix
    A = array([[0.8, 0.2], [-0.3, -0.7]], float32)
    return T, S, dot(A, S)

def whiten(X):
    # zero mean
    X_mean = X.mean(axis=-1)
    X -= X_mean[:, newaxis]
    # whiten: decorrelate the components and rescale to unit variance
    A = dot(X, X.transpose())
    D, E = linalg.eig(A)
    D2 = linalg.inv(array([[D[0], 0.0], [0.0, D[1]]], float32))
    D2[0, 0] = sqrt(D2[0, 0])
    D2[1, 1] = sqrt(D2[1, 1])
    V = dot(D2, E.transpose())
    return dot(V, X), V

def _logcosh(x, alpha=1.0):
    # log-cosh contrast function used by FastICA: returns g(x) = tanh(alpha*x)
    # and the row-wise mean of its derivative
    # (not defined in the original listing; reconstructed from the standard FastICA nonlinearity)
    gx = tanh(alpha * x)
    g_x = alpha * (1 - gx ** 2)
    return gx, g_x.mean(axis=-1)

def do_decorrelation(W):
    # symmetric decorrelation: W <- (W W^T)^(-1/2) W
    # (not defined in the original listing; reconstructed from the standard FastICA step)
    s, u = linalg.eigh(dot(W, W.T))
    return dot(dot(u * (1.0 / sqrt(s)), u.T), W)

def do_fastica(X):
    n, m = X.shape
    p = float(m)
    g = _logcosh
    # black magic: rescale X by sqrt(m)
    X *= sqrt(X.shape[1])
    # create w
    W = ones((n, n), float32)
    for i in range(n):
        for j in range(i):
            W[i, j] = random.random()
    # compute W
    maxIter = 200
    for ii in range(maxIter):
        gwtx, g_wtx = g(dot(W, X))
        W1 = do_decorrelation(dot(gwtx, X.T) / p - g_wtx[:, newaxis] * W)
        lim = max(abs(abs(diag(dot(W1, W.T))) - 1))
        W = W1
        if lim < 0.0001:
            break
    return W

def show_data(T, S):
    plt.plot(T, [S[0, i] for i in range(S.shape[1])], marker="*")
    plt.plot(T, [S[1, i] for i in range(S.shape[1])], marker="o")
    plt.show()

def main():
    T, S, D = create_data()
    Dwhiten, K = whiten(D)
    W = do_fastica(Dwhiten)
    # Sr: reconstructed source signals
    Sr = dot(dot(W, K), D)
    show_data(T, D)
    show_data(T, S)
    show_data(T, Sr)

if __name__ == "__main__":
    main()
##Implementing PCA in Python
import numpy as np
def pca(X, k):
    # k is the number of components to keep
    n_samples, n_features = X.shape
    # mean of each feature
    mean = np.array([np.mean(X[:, i]) for i in range(n_features)])
    # center the data
    norm_X = X - mean
    # scatter matrix
    scatter_matrix = np.dot(np.transpose(norm_X), norm_X)
    # calculate the eigenvalues and eigenvectors
    eig_val, eig_vec = np.linalg.eig(scatter_matrix)
    eig_pairs = [(np.abs(eig_val[i]), eig_vec[:, i]) for i in range(n_features)]
    # sort the eigenvectors by eigenvalue, from highest to lowest
    eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
    # select the top k eigenvectors
    feature = np.array([ele[1] for ele in eig_pairs[:k]])
    # project the centered data onto the new basis
    data = np.dot(norm_X, np.transpose(feature))
    return data
X = np.array([[-1, 1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
print(pca(X,1))
Let us compare our PCA with sklearn's PCA (the two results may differ in sign, since eigenvectors are only determined up to sign):
##PCA with sklearn
from sklearn.decomposition import PCA
import numpy as np
X = np.array([[-1, 1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
pca=PCA(n_components=1)
pca.fit(X)
print(pca.transform(X))
Singular Value Decomposition (SVD) is one of the more widely used algorithms in machine learning, and one of the cornerstones that anyone studying machine-learning algorithms cannot avoid. SVD is mainly used for the feature decomposition inside dimensionality-reduction algorithms, and in recommender systems, natural language processing, computer vision, and other fields. Put simply, SVD factors a linear transformation into rotations and a stretch: A = U Σ V^T, where U and V^T are rotations (orthogonal matrices) and the diagonal Σ stretches along the coordinate axes.
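As a quick illustration, here is a minimal numpy sketch (the example matrix is arbitrary, chosen only for demonstration): the factors returned by np.linalg.svd reconstruct the original matrix.

import numpy as np

A = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])

# full_matrices=False gives the compact (economy) SVD
U, S, Vt = np.linalg.svd(A, full_matrices=False)

print(U.shape, S.shape, Vt.shape)           # (3, 2) (2,) (2, 2)
# U and Vt are orthogonal (rotations/reflections); S holds the stretch factors
print(np.allclose(A, U @ np.diag(S) @ Vt))  # True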
# -*- coding:utf-8 -*-
import numpy as np
from matplotlib import pyplot
class K_Means(object):
    # k is the number of clusters; tolerance is the center-shift threshold;
    # max_iter is the maximum number of iterations
    def __init__(self, k=2, tolerance=0.0001, max_iter=300):
        self.k_ = k
        self.tolerance_ = tolerance
        self.max_iter_ = max_iter

    def fit(self, data):
        self.centers_ = {}
        for i in range(self.k_):
            self.centers_[i] = data[i]

        for _ in range(self.max_iter_):
            self.clf_ = {}
            for i in range(self.k_):
                self.clf_[i] = []
            # print("centers:", self.centers_)
            for feature in data:
                distances = []
                for center in self.centers_:
                    # Euclidean distance
                    # np.sqrt(np.sum((feature - self.centers_[center]) ** 2))
                    distances.append(np.linalg.norm(feature - self.centers_[center]))
                classification = distances.index(min(distances))
                self.clf_[classification].append(feature)

            # print("cluster assignments:", self.clf_)
            prev_centers = dict(self.centers_)
            for c in self.clf_:
                self.centers_[c] = np.average(self.clf_[c], axis=0)

            # check whether every center moved less than the tolerance
            optimized = True
            for center in self.centers_:
                org_centers = prev_centers[center]
                cur_centers = self.centers_[center]
                if np.sum(np.abs((cur_centers - org_centers) / org_centers * 100.0)) > self.tolerance_:
                    optimized = False

            if optimized:
                break

    def predict(self, p_data):
        distances = [np.linalg.norm(p_data - self.centers_[center]) for center in self.centers_]
        index = distances.index(min(distances))
        return index


if __name__ == '__main__':
    x = np.array([[1, 2], [1.5, 1.8], [5, 8], [8, 8], [1, 0.6], [9, 11]])
    k_means = K_Means(k=2)
    k_means.fit(x)
    print(k_means.centers_)
    for center in k_means.centers_:
        pyplot.scatter(k_means.centers_[center][0], k_means.centers_[center][1], marker='*', s=150)

    for cat in k_means.clf_:
        for point in k_means.clf_[cat]:
            pyplot.scatter(point[0], point[1], c=('r' if cat == 0 else 'b'))

    predict = [[2, 1], [6, 9]]
    for feature in predict:
        cat = k_means.predict(feature)
        pyplot.scatter(feature[0], feature[1], c=('r' if cat == 0 else 'b'), marker='x')

    pyplot.show()
2.1 Advantages
Easy to understand, and the clustering results are fairly good; although it only reaches a local optimum, a local optimum is often good enough;
The algorithm scales well when handling large datasets;
It works very well when the clusters are approximately Gaussian;
The algorithmic complexity is low.
2.2 Disadvantages
K has to be set manually, and different values of K give different results;
Sensitive to the initial cluster centers; different initialization schemes give different results;
Sensitive to outliers;
Each sample can only be assigned to one cluster, which is unsuitable for tasks where samples belong to multiple classes;
Not suitable for clusters that are too scattered, class-imbalanced data, or non-convex cluster shapes.
To address the weaknesses of K-means there are many tuning options, such as preprocessing the data (removing outliers), choosing K sensibly, and mapping to a higher-dimensional space; a small sketch of one common way to pick K follows.
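A common heuristic for choosing K is the elbow method: plot the within-cluster sum of squares against K and pick the point where the curve bends. Below is a minimal sketch using sklearn's KMeans (sklearn is an assumption here; the same loop works with the hand-rolled K_Means above):

import numpy as np
from sklearn.cluster import KMeans
from matplotlib import pyplot

x = np.array([[1, 2], [1.5, 1.8], [5, 8], [8, 8], [1, 0.6], [9, 11]])

# inertia_ is the within-cluster sum of squared distances to the centers
inertias = []
ks = range(1, 6)
for k in ks:
    inertias.append(KMeans(n_clusters=k, n_init=10).fit(x).inertia_)

pyplot.plot(list(ks), inertias, marker='o')
pyplot.xlabel('K')
pyplot.ylabel('within-cluster sum of squares')
pyplot.show()  # look for the "elbow" in the curve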
However, the drawback of that algorithm is that it is hard to parallelize. k-means II therefore changes the sampling strategy: instead of sampling a single point per pass as k-means++ does, it samples k points per pass and repeats the sampling log(n) times, producing a set of k·log(n) candidate points, from which the final k centers are chosen. In practice, log(n) passes are usually unnecessary; about 5 passes are enough.
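For reference, the k-means++ seeding that k-means II parallelizes can be sketched as follows (a minimal numpy sketch; the function name kmeans_pp_init and its parameters are mine, not from any library):

import numpy as np

def kmeans_pp_init(data, k, rng=np.random.default_rng(0)):
    # first center: a uniformly random data point
    centers = [data[rng.integers(len(data))]]
    for _ in range(k - 1):
        # squared distance from each point to its nearest chosen center
        d2 = np.min([np.sum((data - c) ** 2, axis=1) for c in centers], axis=0)
        # sample the next center with probability proportional to d2
        probs = d2 / d2.sum()
        centers.append(data[rng.choice(len(data), p=probs)])
    return np.array(centers)

x = np.array([[1, 2], [1.5, 1.8], [5, 8], [8, 8], [1, 0.6], [9, 11]])
print(kmeans_pp_init(x, k=2))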
The figure below describes one way to use an encoder-decoder to translate the English sentence above into French. In the training set, we can append the special symbol "<eos>" (end of sequence) to every sentence to mark the end of the sequence. At each time step, the encoder's input is the next word, punctuation mark, or special symbol "<eos>" of the English sentence. The figure uses the encoder's hidden state at the final time step as the representation, or encoding, of the input sentence. At each time step, the decoder takes as input the encoding of the input sentence together with the previous time step's output and hidden state. We want the decoder to correctly output, step by step and in order, the translated French words, punctuation, and the special symbol "<eos>". Note that the decoder's input at the very first time step is a special symbol "<bos>" (beginning of sequence) marking the start of the sequence.
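To make the data flow concrete, here is a minimal sketch of such an encoder-decoder in PyTorch (PyTorch, the toy vocabulary, and the token ids below are assumptions for illustration; the passage itself does not prescribe a framework): the encoder's final hidden state serves as the sentence encoding and initializes the decoder, which starts from "<bos>" and feeds each output token back in as the next input.

import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)

    def forward(self, src):               # src: (batch, src_len) token ids
        _, h = self.rnn(self.embed(src))  # h: final hidden state = sentence encoding
        return h

class Decoder(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, vocab_size)

    def forward(self, token, h):          # one step: previous token + hidden state
        output, h = self.rnn(self.embed(token), h)
        return self.out(output), h

BOS, EOS = 1, 2                           # assumed ids of <bos> and <eos>
enc, dec = Encoder(100, 32), Decoder(100, 32)
src = torch.tensor([[5, 6, 7, EOS]])      # toy source sentence ending in <eos>
h = enc(src)                              # encode the whole source sentence
token = torch.tensor([[BOS]])             # decoding starts from <bos>
for _ in range(10):                       # greedy decoding
    logits, h = dec(token, h)
    token = logits.argmax(-1)             # feed the prediction back in
    if token.item() == EOS:
        break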