2016 年 5 月

# 測試回合 - 多臂吃角子老虎問題

[圖 1 多武裝的強盜問題上使用瀏覽攻擊

[圖 2 完成多武裝的強盜示範程式碼

``````using System;
namespace MultiBandit
{
class MultiBanditProgram
{
static void Main(string[] args)
{
Console.WriteLine("\nBegin multi-armed bandit demo \n");
Console.WriteLine("Creating 3 Gaussian machines");
Console.WriteLine("Machine 0 mean =  0.0, sd = 1.0");
Console.WriteLine("Machine 1 mean = -0.5, sd = 2.0");
Console.WriteLine("Machine 2 mean =  0.1, sd = 0.5");
Console.WriteLine("Best machine is [2] mean pay = 0.1");
int nMachines = 3;
Machine[] machines = new Machine[nMachines];
machines[0] = new Machine(0.0, 1.0, 0);
machines[1] = new Machine(-0.5, 2.0, 1);
machines[2] = new Machine(0.1, 0.5, 2);
int nPulls = 20;
double pctExplore = 0.40;
Console.WriteLine("Setting nPulls = " + nPulls);
Console.WriteLine("\nUsing pctExplore = " +
pctExplore.ToString("F2"));
double avgPay = ExploreExploit(machines, pctExplore,
nPulls);
double totPay = avgPay * nPulls;
Console.WriteLine("\nAverage pay per pull = " +
avgPay.ToString("F2"));
Console.WriteLine("Total payout         = " +
totPay.ToString("F2"));
double avgBase = machines[2].mean;
double totBase = avgBase * nPulls;
Console.WriteLine("\nBaseline average pay = " +
avgBase.ToString("F2"));
Console.WriteLine("Total baseline pay   = " +
totBase.ToString("F2"));
double regret = totBase - totPay;
Console.WriteLine("\nTotal regret = " +
regret.ToString("F2"));
Console.WriteLine("\nEnd bandit demo \n");
} // Main
static double ExploreExploit(Machine[] machines,
double pctExplore, int nPulls)
{
// Use basic explore-exploit algorithm
// Return the average pay per pull
int nMachines = machines.Length;
Random r = new Random(2); // which machine
double[] explorePays = new double[nMachines];
double totPay = 0.0;
int nExplore = (int)(nPulls * pctExplore);
int nExploit = nPulls - nExplore;
Console.WriteLine("\nStart explore phase");
for (int pull = 0; pull < nExplore; ++pull)
{
int m = r.Next(0, nMachines); // pick a machine
double pay = machines[m].Pay(); // play
Console.Write("[" + pull.ToString().PadLeft(3) + "]  ");
Console.WriteLine("selected machine " + m + ". pay = " +
explorePays[m] += pay; // update
totPay += pay;
} // Explore
int bestMach = BestIdx(explorePays);
Console.WriteLine("\nBest machine found = " + bestMach);
Console.WriteLine("\nStart exploit phase");
for (int pull = 0; pull < nExploit; ++pull)
{
double pay = machines[bestMach].Pay();
Console.Write("[" + pull.ToString().PadLeft(3) + "] ");
Console.WriteLine("pay = " +
totPay += pay; // accumulate
} // Exploit
} // ExploreExploit
static int BestIdx(double[] pays)
{
// Index of array with largest value
int result = 0;
double maxVal = pays[0];
for (int i = 0; i < pays.Length; ++i)
{
if (pays[i] > maxVal)
{
result = i;
maxVal = pays[i];
}
}
return result;
}
} // Program class
public class Machine
{
public double mean; // Avg payout per pull
public double sd; // Variability about the mean
private Gaussian g; // Payout generator
public Machine(double mean, double sd, int seed)
{
this.mean = mean;
this.sd = sd;
this.g = new Gaussian(mean, sd, seed);
}
public double Pay()
{
return this.g.Next();
}
// -----
private class Gaussian
{
private Random r;
private double mean;
private double sd;
public Gaussian(double mean, double sd, int seed)
{
this.r = new Random(seed);
this.mean = mean;
this.sd = sd;
}
public double Next()
{
double u1 = r.NextDouble();
double u2 = r.NextDouble();
double left = Math.Cos(2.0 * Math.PI * u1);
double right = Math.Sqrt(-2.0 * Math.Log(u2));
double z = left * right;
return this.mean + (z * this.sd);
}
}
// -----
} // Machine
} // ns
``````

## 示範程式

``````int nMachines = 3;
Machine[] machines = new Machine[nMachines];
machines[0] = new Machine(0.0, 1.0, 0);
machines[1] = new Machine(-0.5, 2.0, 1);
machines[2] = new Machine(0.1, 0.5, 2);
``````

``````int nPulls = 20;
double pctExplore = 0.40;
double avgPay = ExploreExploit(machines, pctExplore, nPulls);
double totPay = avgPay * nPulls;
``````

ExploreExploit 方法會傳回平均獲得 （或遺失，如果是負數） 每個提取 nPulls 隨機事件之後。因此，從工作階段總數的付款是平均的付款提取每次提取數目。替代的設計是讓 ExploreExploit 傳回而不是平均付款總付款。

``````double avgBase = machines[2].mean;
double totBase = avgBase * nPulls;
double regret = totBase - totPay;
``````

[圖 3 100 隨機高斯值

## 機器類別

``````public class Machine
{
public double mean; // Avg payout per pull
public double sd; // Variability about the mean
private Gaussian g; // Payout generator
...
``````

``````public Machine(double mean, double sd, int seed)
{
this.mean = mean;
this.sd = sd;
this.g = new Gaussian(mean, sd, seed);
}
``````

``````public double Pay()
{
return this.g.Next();
}
``````

## 瀏覽利用實作

ExploreExploit 方法定義的開頭 ︰

``````static double ExploreExploit(Machine[] machines, double pctExplore,
int nPulls)
{
int nMachines = machines.Length;
Random r = new Random(2); // Which machine
double[] explorePays = new double[nMachines];
double totPay = 0.0;
...
``````

``````int nExplore = (int)(nPulls * pctExplore);
int nExploit = nPulls - nExplore;
``````

``````for (int pull = 0; pull < nExplore; ++pull)
{
int m = r.Next(0, nMachines); // Pick a machine
double pay = machines[m].Pay(); // Play
explorePays[m] += pay; // Update
totPay += pay;
}
``````

Random.Next （int minVal，int maxVal） 傳回整數值之間 （含） 之間的 minVal 和 maxVal （獨佔），因此如果 nMachines = 3，r.Next (0，nMachines) 會傳回隨機整數值 0、 1 或 2。

``````int bestMach = BestIdx(explorePays);
for (int pull = 0; pull < nExploit; ++pull)
{
double pay = machines[bestMach].Pay();
totPay += pay; // Accumulate
}
``````

BestIdx 的程式定義的 helper 方法會傳回其保留的最大值的陣列引數的儲存格的索引。有多種變化的多重武裝的強盜問題。例如，一些變化定義不同的方式瀏覽階段期間找到的最佳機器。在我 cranky 看來，這些差異大多只搜尋研究問題的解決方案。

``````. . .