# 使用 k-Means 偵測異常資料

JamesMcCaffrey

## k 平均值演算法

``````[a] (61.0, 100.0)
[b] (64.0, 150.0)
[c] (70.0, 140.0)
``````

``````[m] = ((61.0 + 64.0 + 70.0) / 3, (100.0 + 150.0 + 140.0) / 3)
= (195.0 / 3, 390.0 / 3)
= (65.0, 130.0)
``````

``````dist(m,a) = sqrt((65.0 - 61.0)^2 + (130.0 - 100.0)^2)
= sqrt(4.0^2 + 30.0^2)
= sqrt(16.0 + 900.0)
= sqrt(916.0)
= 30.27
``````

Similarly:

``````dist(m,b) = sqrt((65.0 - 64.0)^2 + (130.0 - 150.0)^2)
= 20.02
dist(m,c) = sqrt((65.0 - 70.0)^2 + (130.0 - 140.0)^2)
= 11.18
``````

``````assign each tuple to a randomly selected cluster
compute the centroid for each cluster
loop until no improvement or until maxCount
assign each tuple to best cluster
(the cluster with closest centroid to tuple)
update each cluster centroid
(based on new cluster assignments)
end loop
return clustering
``````

## 程式的整體結構

``````using System;
namespace ClusteringKMeans
{
class ClusteringKMeansProgram
{
static void Main(string[] args)
{
try
{
Console.WriteLine("\nBegin outlier data detection demo\n");
string[] attributes = new string[] { "Height", "Weight" };
double[][] rawData = new double[20][];
rawData[0] = new double[] { 65.0, 220.0 };
rawData[1] = new double[] { 73.0, 160.0 };
rawData[2] = new double[] { 59.0, 110.0 };
rawData[3] = new double[] { 61.0, 120.0 };
rawData[4] = new double[] { 75.0, 150.0 };
rawData[5] = new double[] { 67.0, 240.0 };
rawData[6] = new double[] { 68.0, 230.0 };
rawData[7] = new double[] { 70.0, 220.0 };
rawData[8] = new double[] { 62.0, 130.0 };
rawData[9] = new double[] { 66.0, 210.0 };
rawData[10] = new double[] { 77.0, 190.0 };
rawData[11] = new double[] { 75.0, 180.0 };
rawData[12] = new double[] { 74.0, 170.0 };
rawData[13] = new double[] { 70.0, 210.0 };
rawData[14] = new double[] { 61.0, 110.0 };
rawData[15] = new double[] { 58.0, 100.0 };
rawData[16] = new double[] { 66.0, 230.0 };
rawData[17] = new double[] { 59.0, 120.0 };
rawData[18] = new double[] { 68.0, 210.0 };
rawData[19] = new double[] { 61.0, 130.0 };
Console.WriteLine("\nRaw data:\n");
ShowMatrix(rawData, rawData.Length, true);
int numAttributes = attributes.Length;
int numClusters = 3;
int maxCount = 30;
Console.WriteLine("\nk = " + numClusters + " and maxCount = " + maxCount);
int[] clustering = Cluster(rawData, numClusters, numAttributes, maxCount);
Console.WriteLine("\nClustering complete");
Console.WriteLine("\nClustering in internal format: \n");
ShowVector(clustering, true);
Console.WriteLine("\nClustered data:");
ShowClustering(rawData, numClusters, clustering, true);
double[] outlier = Outlier(rawData, clustering, numClusters, 0);
Console.WriteLine("Outlier for cluster 0 is:");
ShowVector(outlier, true);
Console.WriteLine("\nEnd demo\n");
}
catch (Exception ex)
{
Console.WriteLine(ex.Message);
}
} // Main
// 14 short static method definitions here
}
}
``````

## 計算聚類質心

``````static void UpdateMeans(double[][] rawData, int[] clustering,
double[][] means)
{
int numClusters = means.Length;
for (int k = 0; k < means.Length; ++k)
for (int j = 0; j < means[k].Length; ++j)
means[k][j] = 0.0;
int[] clusterCounts = new int[numClusters];
for (int i = 0; i < rawData.Length; ++i)
{
int cluster = clustering[i];
++clusterCounts[cluster];
for (int j = 0; j < rawData[i].Length; ++j)
means[cluster][j] += rawData[i][j];
}
for (int k = 0; k < means.Length; ++k)
for (int j = 0; j < means[k].Length; ++j)
means[k][j] /= clusterCounts[k]; // danger
return;
}
``````

``````static double[][] Allocate(int numClusters, int numAttributes)
{
double[][] result = new double[numClusters][];
for (int k = 0; k < numClusters; ++k)
result[k] = new double[numAttributes];
return result;
}
``````

means 陣列中的第一個索引表示聚類 ID，第二個索引表示屬性。 例如，如果 means[0][1] = 150.33，則聚類 0 中元組的體重 (1) 值的平均值為 150.33。

``````static double[] ComputeCentroid(double[][] rawData, int[] clustering,
int cluster, double[][] means)
{
int numAttributes = means[0].Length;
double[] centroid = new double[numAttributes];
double minDist = double.MaxValue;
for (int i = 0; i < rawData.Length; ++i) // walk thru each data tuple
{
int c = clustering[i];
if (c != cluster) continue;
double currDist = Distance(rawData[i], means[cluster]);
if (currDist < minDist)
{
minDist = currDist;
for (int j = 0; j < centroid.Length; ++j)
centroid[j] = rawData[i][j];
}
}
return centroid;
}
``````

``````static void UpdateCentroids(double[][] rawData, int[] clustering,
double[][] means, double[][] centroids)
{
for (int k = 0; k < centroids.Length; ++k)
{
double[] centroid = ComputeCentroid(rawData, clustering, k, means);
centroids[k] = centroid;
}
}
``````

## Distance 函數和資料正常化

``````static double Distance(double[] tuple, double[] vector)
{
double sumSquaredDiffs = 0.0;
for (int j = 0; j < tuple.Length; ++j)
sumSquaredDiffs += Math.Pow((tuple[j] - vector[j]), 2);
return Math.Sqrt(sumSquaredDiffs);
}
``````

## 將每個元組分配給一個聚類

``````static bool Assign(double[][] rawData,
nt[] clustering, double[][] centroids)
{
int numClusters = centroids.Length;
bool changed = false;
double[] distances = new double[numClusters];
for (int i = 0; i < rawData.Length; ++i)
{
for (int k = 0; k < numClusters; ++k)
distances[k] = Distance(rawData[i], centroids[k]);
int newCluster = MinIndex(distances);
if (newCluster != clustering[i])
{
changed = true;
clustering[i] = newCluster;
}
}
return changed;
}
``````

Here’s helper method MinIndex:

``````static int MinIndex(double[] distances)
{
int indexOfMin = 0;
double smallDist = distances[0];
for (int k = 0; k < distances.Length; ++k)
{
if (distances[k] < smallDist)
{
smallDist = distances[k]; indexOfMin = k;
}
}
return indexOfMin;
}
``````

k 平均值演算法的這一實現假定至少總是有一個資料元組分配給每個聚類。 如圖 6 所示，方法 Assign 並不能避免聚類未獲分配元組的情況。 實際上，這通常不是問題。 但要避免這一錯誤情況卻有些棘手。 我通常使用的方法是創建一個名為 centroidIndexes 的陣列，該陣列與 centroids 陣列配合使用。 前面曾提到，陣列 centroids 存放質心值，例如，在圖 2 中 (61.0, 120.0) 是聚類 2 的質心。 陣列 centroidIndexes 存放關聯的元組索引，例如 [3]。 然後，在 Assign 方法中，第一步是將存放質心值的資料元組分配給每個聚類，然後該方法逐一查看其餘每個元組並將其分配給一個聚類。 這種方法可保證每個聚類都至少有一個元組。

## The Cluster Method

``````static int[] Cluster(double[][] rawData, int numClusters,
int numAttributes,  int maxCount)
{
bool changed = true;
int ct = 0;
int numTuples = rawData.Length;
int[] clustering = InitClustering(numTuples, numClusters, 0);
double[][] means = Allocate(numClusters, numAttributes);
double[][] centroids = Allocate(numClusters, numAttributes);
UpdateMeans(rawData, clustering, means);
UpdateCentroids(rawData, clustering, means, centroids);
while (changed == true && ct < maxCount)
{
++ct;
changed = Assign(rawData, clustering, centroids);
UpdateMeans(rawData, clustering, means);
UpdateCentroids(rawData, clustering, means, centroids);
}
return clustering;
}
``````

``````static int[] InitClustering(int numTuples,
int numClusters, int randomSeed)
{
Random random = new Random(randomSeed);
int[] clustering = new int[numTuples];
for (int i = 0; i < numClusters; ++i)
clustering[i] = i;
for (int i = numClusters; i < clustering.Length; ++i)
clustering[i] = random.Next(0, numClusters);
return clustering;
}
``````

InitClustering 方法首先將元組 0 到 num­Clusters-1 分別分配給聚類 0 到 numClusters-1，這樣每個聚類最初至少分配有一個元組。 其餘元組將分配給隨機播放的聚類。

## 查找異常資料

``````static double[] Outlier(double[][] rawData, int[] clustering,
int numClusters, int cluster)
{
int numAttributes = rawData[0].Length;
double[] outlier = new double[numAttributes];
double maxDist = 0.0;
double[][] means = Allocate(numClusters, numAttributes);
double[][] centroids = Allocate(numClusters, numAttributes);
UpdateMeans(rawData, clustering, means);
UpdateCentroids(rawData, clustering, means, centroids);
for (int i = 0; i < rawData.Length; ++i)
{
int c = clustering[i];
if (c != cluster) continue;
double dist = Distance(rawData[i], centroids[cluster]);
if (dist > maxDist)
{
maxDist = dist;
Array.Copy(rawData[i], outlier, rawData[i].Length);
}
}
return outlier;
}
``````

## 顯示常式

``````static void ShowMatrix(double[][] matrix)
{
for (int i = 0; i < numRows; ++i)
{
Console.Write("[" + i.ToString().PadLeft(2) + "]  ");
for (int j = 0; j < matrix[i].Length; ++j)
Console.Write(matrix[i][j].ToString("F1") + "  ");
Console.WriteLine("");
}
}
``````

``````static void ShowVector(int[] vector)
{
for (int i = 0; i < vector.Length; ++i)
Console.Write(vector[i] + " ");
Console.WriteLine("");
}
``````

``````static void ShowVector(double[] vector)
{
for (int i = 0; i < vector.Length; ++i)
Console.Write(vector[i].ToString("F1") + " ");
Console.WriteLine("");
}
``````

``````static void ShowClustering(double[][] rawData,
int numClusters, int[] clustering)
{
for (int k = 0; k < numClusters; ++k) // Each cluster
{
for (int i = 0; i < rawData.Length; ++i) // Each tuple
if (clustering[i] == k)
{
for (int j = 0; j < rawData[i].Length; ++j)
Console.Write(rawData[i][j].ToString("F1") + " ");
Console.WriteLine("");
}
Console.WriteLine("");
}
}
``````

## 總結

Dr.James McCaffrey 供職于 Volt Information Sciences, Inc.，在該公司他負責管理對華盛頓州雷蒙德市沃什灣 Microsoft 總部園區的軟體工程師進行的技術培訓。他參與過多項 Microsoft 產品的研發工作，其中包括 Internet Explorer 和 MSN Search。他是《.NET Test Automation Recipes》(Apress, 2006) 的作者，您可以通過以下電子郵箱位址與他聯繫：jammc@microsoft.com