# 使用 C# 进行 Naive Bayes 分类

James McCaffrey

Naive Bayes 分类是一种机器学习方法，可用于预测特定的数据用例属于哪个类别。在本文中，我将介绍 Naive Bayes 分类的工作原理，并提供一个使用 C# 语言编码的示例。

## Naive Bayes 分类的工作原理

``````
P(male | X) =
[ P(education | male) * P(right | male) * P(tall | male) * P(male) ] /
[ PP(male | X) + PP(female | X) ]
``````

``````
P(education | male) = count(education & male) / count(male) = 2/24 = 0.0833
``````

``````
P(right | male) = count(right & male) / count(male) = 17/24 = 0.7083
P(tall | male) = count(tall & male) / count(male) = 4/24 = 0.1667
``````

``````
P(education | female) = count(education & female) / count(female) = 4/16 = 0.2500
P(right | female) = count(right & female) / count(female) = 14/16 = 0.8750
P(tall | female) = count(tall & female) / count(female) = 2/16 = 0.1250
P(female) = 16/40 = 0.4000
``````

``````
PP(male | X) = 0.0833 * 0.7083 * 0.1667 * 0.6000 = 0.005903
``````

``````
PP(female | X) = 0.2500 * 0.8750 * 0.1250 * 0.4000 = 0.010938
``````

``````
P(male | X) = 0.005903 / (0.005903 + 0.010938) = 0.3505
P(female | X) = 0.010938 / (0.005903 + 0.010938) = 0.6495
``````

## Laplacian 平滑处理

``````
P(education | male) =
(count(education & male) + 1) / (count(male) + 3) = 3/27 = 0.1111
P(right | male) =
(count(right & male) + 1) / (count(male) + 3) = 18/27 = 0.6667
P(tall | male) =
(count(tall & male) + 1) / (count(male) + 3) = 5/27 = 0.1852
P(male) = 24/40 = 0.6000
P(education | female) =
(count(education & female) + 1) / (count(female) + 3) = 5/19 = 0.2632
P(right | female) =
(count(right & female) + 1) / (count(female) + 3) = 15/19 = 0.7895
P(tall | female) =
(count(tall & female) + 1) / (count(female) + 3) = 3/19 = 0.1579
P(female) = 16/40 = 0.4000
``````

``````
PP(male | X) = 0.1111 * 0.6667 * 0.1852 * 0.6000 = 0.008230
PP(female | X) = 0.2632 * 0.7895 * 0.1579 * 0.4000 = 0.013121
``````

``````
P(male | X) = 0.008230 / (0.008230 + 0.013121) = 0.3855
P(female | X) = 0.013121 / (0.008230 + 0.013121) = 0.6145
``````

## 程序的整体结构

``````csharp
using System;

namespace NaiveBayes
{
    /// <summary>
    /// Console demo of Naive Bayes classification: it generates sample rows,
    /// bins the numeric height column, tallies joint and per-sex counts, and
    /// then classifies one hard-coded data case as male or female.
    /// </summary>
    class Program
    {
        // Fixed seed (the value 25 is arbitrary) so the pseudo-random sequence
        // is the same on every run.
        static Random ran = new Random(25);

        /// <summary>
        /// Runs the demo end to end. Any exception is caught and only its
        /// message is written to the console.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            try
            {
                // Column names; the last one ("sex") is the value being predicted.
                string[] attributes = { "occupation", "dominance", "height", "sex" };

                // Allowed values per attribute, in the same order as 'attributes'.
                string[][] attributeValues =
                {
                    new[] { "administrative", "construction", "education", "technology" },
                    new[] { "left", "right" },
                    new[] { "short", "medium", "tall" },
                    new[] { "male", "female" }
                };

                // Cut points handed to BinData for the numeric height column.
                double[][] numericAttributeBorders = { new[] { 64.0, 71.0 } };

                // Only the first few rows are echoed, before and after binning.
                const int previewRows = 4;

                string[] data = MakeData(40);
                for (int row = 0; row < previewRows; ++row)
                {
                    Console.WriteLine(data[row]);
                }

                string[] binnedData = BinData(data, attributeValues, numericAttributeBorders);
                for (int row = 0; row < previewRows; ++row)
                {
                    Console.WriteLine(binnedData[row]);
                }

                int[][][] jointCounts = MakeJointCounts(binnedData, attributes, attributeValues);
                int[] dependentCounts = MakeDependentCounts(jointCounts, 2);
                Console.WriteLine("Total male = " + dependentCounts[0]);
                Console.WriteLine("Total female = " + dependentCounts[1]);
                ShowJointCounts(jointCounts, attributeValues);

                // The single data case to classify.
                string occupation = "education";
                string dominance = "right";
                string height = "tall";
                bool withLaplacian = true;

                Console.WriteLine(" occupation = " + occupation);
                Console.WriteLine(" dominance = " + dominance);
                Console.WriteLine(" height = " + height);

                // The trailing 3 is the xClasses argument of Classify.
                int c = Classify(occupation, dominance, height, jointCounts,
                    dependentCounts, withLaplacian, 3);

                switch (c)
                {
                    case 0:
                        Console.WriteLine("\nData case is most likely male");
                        break;
                    case 1:
                        Console.WriteLine("\nData case is most likely female");
                        break;
                }

                Console.WriteLine("\nEnd demo\n");
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        } // End Main

        // Methods to create data
        // Method to bin data
        // Method to compute joint counts
        // Helper method to compute partial probabilities
        // Method to classify a data case
    } // End class Program
}
``````

Main 中调用的主要方法及其用途如下：BinData 用于对身高数据进行装箱，即把数值型身高转换为 short、medium、tall 三个类别之一；MakeJointCounts 用于扫描装箱数据并计算结点计数；MakeDependentCounts 用于计算男性和女性的总人数；Classify 使用结点计数和因变量计数执行 Naive Bayes 分类。

## 数据装箱

``````csharp
/// <summary>
/// Converts the numeric height column of each comma-separated row into one of
/// the categorical height labels, leaving the other three columns unchanged.
/// </summary>
/// <param name="data">
/// Rows of the form "occupation,dominance,height,sex", where height is a
/// number written with '.' as the decimal separator.
/// </param>
/// <param name="attributeValues">
/// Allowed values per attribute; index 2 supplies the three height labels in
/// the order low, middle, high.
/// </param>
/// <param name="numericAttributeBorders">
/// Cut points; entry [0] holds the lower and upper height borders.
/// </param>
/// <returns>A new array with one binned row per input row.</returns>
/// <exception cref="System.FormatException">
/// A row has fewer than four fields, or its height field is not a valid number.
/// </exception>
static string[] BinData(string[] data, string[][] attributeValues,
    double[][] numericAttributeBorders)
{
    const int fieldCount = 4;
    const int heightField = 2;

    double lowerBorder = numericAttributeBorders[0][0];
    double upperBorder = numericAttributeBorders[0][1];
    string[] heightLabels = attributeValues[heightField];

    string[] result = new string[data.Length];
    for (int i = 0; i < data.Length; ++i)
    {
        string[] tokens = data[i].Split(',');
        if (tokens.Length < fieldCount)
        {
            throw new FormatException("Row " + i + " has " + tokens.Length +
                " fields; expected " + fieldCount + ": " + data[i]);
        }

        // The rows are machine-readable text, so parse with the invariant
        // culture; otherwise a comma-decimal current culture would misread
        // "66.25" and put the row in the wrong bin.
        double heightAsDouble = double.Parse(tokens[heightField],
            System.Globalization.CultureInfo.InvariantCulture);

        string heightAsBinnedString;
        if (heightAsDouble <= lowerBorder)
        {
            heightAsBinnedString = heightLabels[0]; // Short
        }
        else if (heightAsDouble >= upperBorder)
        {
            heightAsBinnedString = heightLabels[2]; // Tall
        }
        else
        {
            heightAsBinnedString = heightLabels[1]; // Medium
        }

        result[i] = tokens[0] + "," + tokens[1] + "," + heightAsBinnedString +
            "," + tokens[3];
    }
    return result;
}
``````

## 确定结点计数

Naive Bayes 分类的关键在于计算结点计数（joint count，即联合计数：某个自变量属性值与某个因变量属性值同时出现的训练用例数）。 在演示示例中，共有九个自变量 X 属性值 (administrative, construction, … tall) 和两个因变量属性值 (male, female)，因此总共必须计算并存储 9 * 2 = 18 个结点计数。 我的首选方法是将结点计数存储在一个三维数组 int[][][] jointCounts 中。 第一个索引表示自变量 X 属性；第二个索引表示自变量 X 属性值；第三个索引表示因变量属性值。 例如，jointCounts[0][3][1] 表示属性 0 (occupation)、属性值 3 (technology) 和 sex 1 (female)，换句话说，jointCounts[0][3][1] 中的值是职业为技术且性别为女性的训练用例的计数。 方法 MakeJointCounts 如图 4 所示。

``````csharp
/// <summary>
/// Tallies, for every (attribute, attribute value, sex) combination, how many
/// binned rows contain that combination.
/// </summary>
/// <param name="binnedData">Rows of the form "occupation,dominance,height,sex".</param>
/// <param name="attributes">Attribute names; only the count is used here.</param>
/// <param name="attributeValues">Allowed values per attribute; not read by this method.</param>
/// <returns>
/// Counts indexed as [attribute][attribute value][sex], covering the three
/// predictor attributes (the sex column itself gets no slot).
/// </returns>
static int[][][] MakeJointCounts(string[] binnedData, string[] attributes,
    string[][] attributeValues)
{
    const int sexCount = 2;

    int[][][] jointCounts = new int[attributes.Length - 1][][]; // -1 (no sex)
    jointCounts[0] = new int[4][]; // 4 occupations
    jointCounts[1] = new int[2][]; // 2 dominances
    jointCounts[2] = new int[3][]; // 3 heights

    // Give every attribute value its own zero-initialised per-sex counter pair.
    foreach (int[][] countsForAttribute in jointCounts)
    {
        for (int v = 0; v < countsForAttribute.Length; ++v)
        {
            countsForAttribute[v] = new int[sexCount];
        }
    }

    foreach (string row in binnedData)
    {
        string[] fields = row.Split(',');

        // Translate each field's text into its position within that attribute.
        int occ = AttributeValueToIndex(0, fields[0]);
        int dom = AttributeValueToIndex(1, fields[1]);
        int hgt = AttributeValueToIndex(2, fields[2]);
        int sex = AttributeValueToIndex(3, fields[3]);

        jointCounts[0][occ][sex]++;
        jointCounts[1][dom][sex]++;
        jointCounts[2][hgt][sex]++;
    }

    return jointCounts;
}
``````

``````csharp
jointCounts[0] = new int[4][]; // 4 occupations
jointCounts[1] = new int[2][]; // 2 dominances
jointCounts[2] = new int[3][]; // 3 heights
``````

辅助函数 AttributeValueToIndex 接受属性索引和属性值字符串并返回相应的索引。 例如，AttributeValueToIndex(2, "medium") 返回 height 属性中 "medium" 的索引，也就是 1。

``````csharp
/// <summary>
/// Derives the total number of cases per dependent value (male, then female)
/// by summing the joint counts of attribute 0 across all of its values.
/// </summary>
/// <param name="jointCounts">Counts indexed as [attribute][attribute value][dependent value].</param>
/// <param name="numDependents">How many dependent values to total.</param>
/// <returns>An array of length <paramref name="numDependents"/> holding one total per dependent value.</returns>
static int[] MakeDependentCounts(int[][][] jointCounts,
    int numDependents)
{
    int[] totals = new int[numDependents];

    for (int dependent = 0; dependent < numDependents; ++dependent)
    {
        // Only attribute 0 is scanned: each case contributes exactly one
        // increment per attribute, so a single attribute already sums to the total.
        int sum = 0;
        foreach (int[] countsForValue in jointCounts[0])
        {
            sum += countsForValue[dependent];
        }
        totals[dependent] = sum;
    }

    return totals;
}
``````

## 对数据用例进行分类

``````csharp
/// <summary>
/// Decides whether a data case is more likely male or female by comparing the
/// two partial probabilities after normalising them by their sum.
/// </summary>
/// <param name="occupation">Occupation value of the case.</param>
/// <param name="dominance">Hand-dominance value of the case.</param>
/// <param name="height">Binned height value of the case.</param>
/// <param name="jointCounts">Counts indexed as [attribute][attribute value][sex].</param>
/// <param name="dependentCounts">Total cases per sex (male at 0, female at 1).</param>
/// <param name="withSmoothing">Forwarded to PartialProbability to select the smoothed formula.</param>
/// <param name="xClasses">Forwarded to PartialProbability as the smoothing denominator increment.</param>
/// <returns>0 when male is strictly more probable; otherwise 1 (female).</returns>
static int Classify(string occupation, string dominance, string height,
    int[][][] jointCounts, int[] dependentCounts, bool withSmoothing,
    int xClasses)
{
    double maleScore = PartialProbability("male", occupation, dominance,
        height, jointCounts, dependentCounts, withSmoothing, xClasses);
    double femaleScore = PartialProbability("female", occupation, dominance,
        height, jointCounts, dependentCounts, withSmoothing, xClasses);

    // Normalise so the two values sum to 1 before comparing them.
    double total = maleScore + femaleScore;
    double maleProbability = maleScore / total;
    double femaleProbability = femaleScore / total;

    // A tie (or any non-greater comparison) falls through to female.
    return maleProbability > femaleProbability ? 0 : 1;
}
``````

``````csharp
/// <summary>
/// Computes the unnormalised ("partial") probability that a data case with the
/// given occupation, dominance and height belongs to the given sex:
/// P(sex) * P(occupation | sex) * P(dominance | sex) * P(height | sex).
/// </summary>
/// <param name="sex">"male" or "female".</param>
/// <param name="occupation">Occupation value of the case.</param>
/// <param name="dominance">Hand-dominance value of the case.</param>
/// <param name="height">Binned height value of the case.</param>
/// <param name="jointCounts">Counts indexed as [attribute][attribute value][sex].</param>
/// <param name="dependentCounts">Total cases per sex (male at 0, female at 1).</param>
/// <param name="withSmoothing">
/// When true, each conditional probability is (count + 1) / (total + xClasses);
/// when false it is the raw ratio count / total.
/// </param>
/// <param name="xClasses">Amount added to the denominator when smoothing.</param>
/// <returns>The product of the prior and the three conditional probabilities.</returns>
static double PartialProbability(string sex, string occupation, string dominance,
    string height, int[][][] jointCounts, int[] dependentCounts,
    bool withSmoothing, int xClasses)
{
    int sexIndex = AttributeValueToIndex(3, sex);
    int occupationIndex = AttributeValueToIndex(0, occupation);
    int dominanceIndex = AttributeValueToIndex(1, dominance);
    int heightIndex = AttributeValueToIndex(2, height);

    int totalMale = dependentCounts[0];
    int totalFemale = dependentCounts[1];
    int totalCases = totalMale + totalFemale;

    // Number of training cases of the requested sex; stays 0 for any other string.
    int totalToUse = 0;
    if (sex == "male") totalToUse = totalMale;
    else if (sex == "female") totalToUse = totalFemale;

    double p0 = (totalToUse * 1.0) / (totalCases); // Prob male or female

    int occupationCount = jointCounts[0][occupationIndex][sexIndex];
    int dominanceCount = jointCounts[1][dominanceIndex][sexIndex];
    int heightCount = jointCounts[2][heightIndex][sexIndex];

    double p1;
    double p2;
    double p3;
    if (!withSmoothing)
    {
        // The * 1.0 forces floating-point division instead of integer division.
        p1 = (occupationCount * 1.0) / totalToUse;
        p2 = (dominanceCount * 1.0) / totalToUse;
        p3 = (heightCount * 1.0) / totalToUse;
    }
    else
    {
        // Add-one smoothing keeps a zero joint count from zeroing the product.
        double smoothedTotal = (totalToUse + xClasses) * 1.0;
        p1 = (occupationCount + 1) / smoothedTotal;
        p2 = (dominanceCount + 1) / smoothedTotal;
        p3 = (heightCount + 1) / smoothedTotal;
    }

    // The plain product p0 * p1 * p2 * p3 was flagged by the original author as
    // risky when any factor is very small, so the product is formed in log space.
    return Math.Exp(Math.Log(p0) + Math.Log(p1) + Math.Log(p2) + Math.Log(p3));
}
``````

## 总结

James McCaffrey 博士供职于 Volt Information Sciences, Inc.，在该公司他负责管理对华盛顿州雷德蒙德市 Microsoft 总部园区的软件工程师进行的技术培训。 他参与过多项 Microsoft 产品的研发工作，其中包括 Internet Explorer 和 MSN Search。 他是《.NET Test Automation Recipes》(Apress, 2006) 的作者，您可以通过以下电子邮箱地址与他联系：jammc@microsoft.com