1: /*
2: Copyright (c) 2010 <a href="http://www.gutgames.com">James Craig</a>
3:
4: Permission is hereby granted, free of charge, to any person obtaining a copy
5: of this software and associated documentation files (the "Software"), to deal
6: in the Software without restriction, including without limitation the rights
7: to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8: copies of the Software, and to permit persons to whom the Software is
9: furnished to do so, subject to the following conditions:
10:
11: The above copyright notice and this permission notice shall be included in
12: all copies or substantial portions of the Software.
13:
14: THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15: IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16: FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17: AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18: LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19: OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20: THE SOFTWARE.*/
21:
22: #region Usings
23: using Utilities.DataTypes;
24: using System.Collections.Generic;
25: using Utilities.Math;
26: #endregion
27:
28: namespace Utilities.Classifier.NaiveBayes
29: {
/// <summary>
/// Two-class naive Bayes classifier. It is trained from two bags of tokens
/// (set A and set B) and estimates the probability that a list of tokens
/// belongs to set A.
/// </summary>
/// <typeparam name="T">The type of the individual tokens</typeparam>
public class NaiveBayes<T>
{
    #region Constructor

    /// <summary>
    /// Constructor. Creates empty token sets and applies the default settings
    /// (weights of 1, minimum count of 1, probabilities clamped to [0.01, 0.999]
    /// and no limit on the number of interesting tokens).
    /// </summary>
    public NaiveBayes()
    {
        SetA = new Bag<T>();
        SetB = new Bag<T>();
        Probabilities = new Dictionary<T, double>();
        Total = 0;
        TotalA = 0;
        TotalB = 0;
        ATokenWeight = 1;
        BTokenWeight = 1;
        MinCountForInclusion = 1;
        MinTokenProbability = 0.01;
        MaxTokenProbability = 0.999;
        MaxInterestingTokenCount = int.MaxValue;
    }

    #endregion

    #region Properties

    /// <summary>
    /// Set A
    /// </summary>
    public Bag<T> SetA { get; set; }

    /// <summary>
    /// Set B
    /// </summary>
    public Bag<T> SetB { get; set; }

    /// <summary>
    /// Sum of TotalA and TotalB, refreshed by LoadTokens
    /// </summary>
    private double Total { get; set; }

    /// <summary>
    /// Sum of the counts of the tokens in set A, refreshed by LoadTokens
    /// </summary>
    private double TotalA { get; set; }

    /// <summary>
    /// Sum of the counts of the tokens in set B, refreshed by LoadTokens
    /// </summary>
    private double TotalB { get; set; }

    /// <summary>
    /// Per-token probability of belonging to set A, rebuilt by LoadTokens
    /// </summary>
    private Dictionary<T, double> Probabilities { get; set; }

    /// <summary>
    /// Weight to give to the probabilities in set A
    /// </summary>
    public int ATokenWeight { get; set; }

    /// <summary>
    /// Weight to give the probabilities in set B
    /// </summary>
    public int BTokenWeight { get; set; }

    /// <summary>
    /// Minimum count that an item needs to be found to be included in final probability
    /// </summary>
    public int MinCountForInclusion { get; set; }

    /// <summary>
    /// Minimum token probability (if less than this amount, it becomes this amount)
    /// </summary>
    public double MinTokenProbability { get; set; }

    /// <summary>
    /// Maximum token probability (if greater than this amount, it becomes this amount)
    /// </summary>
    public double MaxTokenProbability { get; set; }

    /// <summary>
    /// After sorting, this is the maximum number of tokens that are picked to figure out the final probability.
    /// A value of zero or less means that no token is picked.
    /// </summary>
    public int MaxInterestingTokenCount { get; set; }

    #endregion

    #region Public Functions

    /// <summary>
    /// Adds the given tokens to set A and set B, then recomputes the totals
    /// and the per-token probabilities from the full content of both sets.
    /// </summary>
    /// <param name="SetATokens">Tokens to add to set A</param>
    /// <param name="SetBTokens">Tokens to add to set B</param>
    /// <exception cref="System.ArgumentNullException">Either list is null</exception>
    public void LoadTokens(System.Collections.Generic.List<T> SetATokens, System.Collections.Generic.List<T> SetBTokens)
    {
        // Validate both arguments before touching any state so that a bad call
        // cannot leave the classifier half updated.
        if (SetATokens == null)
        {
            throw new System.ArgumentNullException("SetATokens");
        }
        if (SetBTokens == null)
        {
            throw new System.ArgumentNullException("SetBTokens");
        }

        foreach (T TokenA in SetATokens)
        {
            SetA.Add(TokenA);
        }
        foreach (T TokenB in SetBTokens)
        {
            SetB.Add(TokenB);
        }

        TotalA = SumCounts(SetA);
        TotalB = SumCounts(SetB);
        Total = TotalA + TotalB;

        // The probability of a token depends on both sets, so the table is
        // rebuilt from scratch rather than patched.
        Probabilities = new Dictionary<T, double>();
        foreach (T Token in SetA)
        {
            Probabilities[Token] = CalculateProbabilityOfToken(Token);
        }
        foreach (T Token in SetB)
        {
            if (!Probabilities.ContainsKey(Token))
            {
                Probabilities[Token] = CalculateProbabilityOfToken(Token);
            }
        }
    }

    /// <summary>
    /// Calculates the probability of the list of tokens being in set A.
    /// Tokens are ranked by how far their probability is from 0.5 and at most
    /// MaxInterestingTokenCount of the top ranked ones are combined. Tokens
    /// that were never loaded (and null tokens) count as 0.5.
    /// </summary>
    /// <param name="Items">List of items</param>
    /// <returns>
    /// The probability that the tokens are from set A; 0.5 when no token is
    /// considered or when the evidence is contradictory.
    /// </returns>
    /// <exception cref="System.ArgumentNullException">Items is null</exception>
    public double CalculateProbabilityOfTokens(System.Collections.Generic.List<T> Items)
    {
        if (Items == null)
        {
            throw new System.ArgumentNullException("Items");
        }

        int ItemCount = Items.Count;
        double[] TokenProbabilities = new double[ItemCount];
        double[] Distances = new double[ItemCount];
        List<int> Order = new List<int>(ItemCount);
        for (int x = 0; x < ItemCount; ++x)
        {
            double TokenProbability = 0.5;
            if (Items[x] != null)
            {
                double Known;
                if (Probabilities.TryGetValue(Items[x], out Known))
                {
                    TokenProbability = Known;
                }
            }
            TokenProbabilities[x] = TokenProbability;
            // 0 means the token is decisive, 0.5 means it carries no information.
            Distances[x] = 0.5 - System.Math.Abs(0.5 - TokenProbability);
            Order.Add(x);
        }

        // Rank numerically. Ties are broken by position so that the ordering
        // is deterministic; a text key built from the token could collide
        // (e.g. token "1" at index 11 and token "11" at index 1) and would
        // depend on the current culture.
        Order.Sort(delegate(int Left, int Right)
        {
            int Result = Distances[Left].CompareTo(Distances[Right]);
            return Result != 0 ? Result : Left.CompareTo(Right);
        });

        // A zero or negative limit leaves the loop below empty.
        int Limit = System.Math.Min(ItemCount, MaxInterestingTokenCount);
        double TotalProbability = 1;
        double NegativeTotalProbability = 1;
        for (int x = 0; x < Limit; ++x)
        {
            double TokenProbability = TokenProbabilities[Order[x]];
            TotalProbability *= TokenProbability;
            NegativeTotalProbability *= (1 - TokenProbability);
        }

        double Denominator = TotalProbability + NegativeTotalProbability;
        if (Denominator > 0)
        {
            return TotalProbability / Denominator;
        }

        // Both products collapsed to zero (underflow on long inputs, or tokens
        // with probability 0 and 1 in the same list). Redo the combination as
        // a sum of log-odds, which does not underflow.
        double LogOdds = 0;
        for (int x = 0; x < Limit; ++x)
        {
            double TokenProbability = TokenProbabilities[Order[x]];
            LogOdds += System.Math.Log(TokenProbability) - System.Math.Log(1 - TokenProbability);
        }
        if (double.IsNaN(LogOdds))
        {
            // Contradictory or invalid evidence: stay neutral.
            return 0.5;
        }
        return 1 / (1 + System.Math.Exp(-LogOdds));
    }

    #endregion

    #region Private Functions

    /// <summary>
    /// Adds up the counts of every token enumerated from a bag
    /// </summary>
    /// <param name="Set">Bag to total</param>
    /// <returns>The sum of the counts</returns>
    private static double SumCounts(Bag<T> Set)
    {
        double Sum = 0;
        foreach (T Token in Set)
        {
            Sum += Set[Token];
        }
        return Sum;
    }

    /// <summary>
    /// Calculates a single items probability of being in set A
    /// </summary>
    /// <param name="Item">Item to calculate</param>
    /// <returns>
    /// The probability that the token is from set A, clamped to
    /// [MinTokenProbability, MaxTokenProbability]; 0 when the weighted count
    /// is below MinCountForInclusion; 0.5 when neither set gives any evidence.
    /// </returns>
    private double CalculateProbabilityOfToken(T Item)
    {
        int ACount = SetA.Contains(Item) ? SetA[Item] * ATokenWeight : 0;
        int BCount = SetB.Contains(Item) ? SetB[Item] * BTokenWeight : 0;
        if (ACount + BCount < MinCountForInclusion)
        {
            // NOTE(review): a probability of 0 makes any list containing this
            // token combine to 0. Kept as is for backward compatibility.
            return 0;
        }

        // An empty set has a total of zero; treat its relative frequency as 0
        // instead of dividing by zero and spreading NaN.
        double AProbability = TotalA > 0 ? System.Math.Min(1, ACount / TotalA) : 0;
        double BProbability = TotalB > 0 ? System.Math.Min(1, BCount / TotalB) : 0;
        double Combined = AProbability + BProbability;
        if (Combined <= 0)
        {
            return 0.5;
        }
        return System.Math.Max(MinTokenProbability,
            System.Math.Min(MaxTokenProbability, AProbability / Combined));
    }

    #endregion
}
210: }