From eb1b7980030e898d910ce48b14ec88698ffa313e Mon Sep 17 00:00:00 2001
From: Yaksher <yaksher.git@gmail.com>
Date: Sat, 11 Nov 2023 22:43:56 -0800
Subject: [PATCH] Rename outcomes to categories; unpack model_dict global

---
 bayes.py | 49 ++++++++++++++++++++++++-------------------------
 main.py  | 22 +++++++++++++---------
 2 files changed, 37 insertions(+), 34 deletions(-)
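
Notes: train() now returns a plain tuple rather than a dict, and the caller
installs the three pieces as module-level globals on bayes. A minimal sketch
of the new contract (the two-document dataset is made up for illustration):

    import bayes

    dataset = [
        {'name': 'a', 'classification': 'spam', 'contents': 'buy now'},
        {'name': 'b', 'classification': 'ham', 'contents': 'see you now'},
    ]
    (
        bayes.count_of_word_by_category,         # {'spam': Counter({'buy': 1, 'now': 1}), ...}
        bayes.num_data_points,                   # 2
        bayes.count_of_data_points_in_category,  # Counter({'spam': 1, 'ham': 1})
    ) = bayes.train(dataset)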

diff --git a/bayes.py b/bayes.py
index 6752762a..54bb9878 100644
--- a/bayes.py
+++ b/bayes.py
@@ -4,57 +4,56 @@ from math import log, inf
 from functools import cache
 from typing import List
 
-"""After training, model_dict is a global variable which is accessible inside this module"""
-
 @cache
 def tokenize(text):
     return [y for y in [re.sub('[^a-z0-9]', '', x) for x in text.lower().split(" ")]  if len(y)]
 
 def train(dataset):
-    model_dict = {}
-    count_of_word_by_outcome = {}
+    count_of_word_by_category = {}
     num_data_points = len(dataset)
-    count_of_data_points_with_outcome = Counter()
+    count_of_data_points_in_category = Counter()
     for point in dataset:
         name = point['name']
         classification = point['classification']
-        count_of_data_points_with_outcome[classification] += 1
-        if classification not in count_of_word_by_outcome:
-            count_of_word_by_outcome[classification] = Counter()
+        count_of_data_points_in_category[classification] += 1
+        if classification not in count_of_word_by_category:
+            count_of_word_by_category[classification] = Counter()
         words = set(tokenize(point['contents']))
         for word in words:
-            count_of_word_by_outcome[classification][word] += 1
-
-    model_dict['count_of_word_by_outcome'] = count_of_word_by_outcome
-    model_dict['num_data_points'] = num_data_points
-    model_dict['count_of_data_points_with_outcome'] = count_of_data_points_with_outcome
+            count_of_word_by_category[classification][word] += 1
 
-    return model_dict
+    return (count_of_word_by_category, num_data_points, count_of_data_points_in_category)
 
 """
 TODO - Implement the following functions.
-In each of these functions, you can use the MODEL variable which is a dictionary which has three members:
-    MODEL['count_of_word_by_outcome'][outcome][word] = Total number of documents in the category 'outcome' in which this word appears
-    MODEL['num_data_points'] = Total number of documents in the data set
-    MODEL['count_of_data_points_with_outcome'][outcome] = Total number of documents in the category 'outcome'
+
+After training (which will be run before your code), the following global
+variables are available:
+    count_of_word_by_category[category][word] = Total number of documents in
+        the category 'category' in which this word appears
+
+    num_data_points = Total number of documents in the data set
+
+    count_of_data_points_in_category[category] = Total number of documents in
+        the category 'category'
 """
 @cache
-def pr_outcome(outcome : str) : # Pr(outcome)
+def pr_category(category: str):  # Pr(category)
     return 0
 
 @cache
-def pr_word_given_outcome(word : str, outcome : str, num_words_in_document : int): # Pr(word | outcome)
+def pr_word_given_category(word: str, category: str, num_words_in_document: int):  # Pr(word | category)
     return 0
 
-def pr_outcome_given_words(words : List[str], outcome : str): # Pr(outcome | words)
+def pr_category_given_words(words: List[str], category: str):  # Pr(category | words)
     return 0
 
-def predict(outcomes, words):
+def predict(categories, words):
     best = None
     best_likelihood = -inf
-    for outcome in outcomes:
-        pr = pr_outcome_given_words(words, outcome)
+    for category in categories:
+        pr = pr_category_given_words(words, category)
         if  pr > best_likelihood:
-            best = outcome
+            best = category
             best_likelihood = pr
     return best
diff --git a/main.py b/main.py
index 191ba508..b15ff249 100644
--- a/main.py
+++ b/main.py
@@ -8,25 +8,29 @@ VALIDATE = 'data/validate.json'
 train = json.loads(open(TRAIN).read())
 validate = json.loads(open(VALIDATE).read())
 
-def test(dataset, outcomes):
+def test(dataset, categories):
     answers = dict([x.split(" ") for x in open(dataset + "_validate.txt").read().split("\n")[:-1]])
 
-    bayes.model_dict = bayes.train(train[dataset])
+    (
+        bayes.count_of_word_by_category,
+        bayes.num_data_points,
+        bayes.count_of_data_points_in_category
+    ) = bayes.train(train[dataset])
 
-    correct_by_outcome = Counter()
-    incorrect_by_outcome = Counter()
+    correct_by_category = Counter()
+    incorrect_by_category = Counter()
 
     for point in validate[dataset]:
         words = set(bayes.tokenize(point['contents']))
-        prediction = bayes.predict(outcomes, words)
+        prediction = bayes.predict(categories, words)
         answer = answers[point['name']]
         if prediction == answer:
-            correct_by_outcome[answer] += 1
+            correct_by_category[answer] += 1
         else:
-            incorrect_by_outcome[answer] += 1
+            incorrect_by_category[answer] += 1
 
-    print(correct_by_outcome)
-    print(incorrect_by_outcome)
+    print(correct_by_category)
+    print(incorrect_by_category)
 
 test('tweets', set(['positive', 'negative']))
 test('emails', set(['spam', 'ham']))
-- 
GitLab
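
For reference, one way the renamed stubs might later be filled in: a sketch
assuming add-one (Laplace) smoothing and log-space accumulation, with
num_words_in_document used as the extra mass in the smoothing denominator.
None of this is prescribed by the patch; the bodies below are drop-ins for
bayes.py and rely on its existing imports plus the globals that main.py
installs after training.

    @cache
    def pr_category(category: str):  # Pr(category)
        return count_of_data_points_in_category[category] / num_data_points

    @cache
    def pr_word_given_category(word: str, category: str, num_words_in_document: int):  # Pr(word | category)
        # Add-one smoothing keeps unseen words from zeroing out the product.
        return ((count_of_word_by_category[category][word] + 1)
                / (count_of_data_points_in_category[category] + num_words_in_document))

    def pr_category_given_words(words: List[str], category: str):  # Pr(category | words)
        # Log space avoids underflow; Pr(words) is constant across categories,
        # so it can be dropped when only the argmax in predict() matters.
        return log(pr_category(category)) + sum(
            log(pr_word_given_category(word, category, len(words)))
            for word in words)

One caveat with @cache here: main.py retrains between the 'tweets' and
'emails' runs, but cached results from the first run would survive into the
second. Calling pr_category.cache_clear() and
pr_word_given_category.cache_clear() after each train() would avoid serving
stale probabilities.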