cs13-23sp / project02 / Commits

Commit cb3610f7
Authored 1 year ago by Yaksher

Rename outcomes to categories; unpack model_dict global

Parent: fa240df0 (master)
Showing 2 changed files with 29 additions and 33 deletions:

  bayes.py  +20  -24
  main.py    +9   -9
bayes.py

@@ -11,50 +11,46 @@ def tokenize(text):
     return [y for y in [re.sub('[^a-z0-9]', '', x) for x in text.lower().split(" ")] if len(y)]

 def train(dataset):
-    model_dict = {}
-    count_of_word_by_outcome = {}
+    global count_of_word_by_category
+    global num_data_points
+    global num_data_points_in_category
+    count_of_word_by_category = {}
     num_data_points = len(dataset)
-    count_of_data_points_with_outcome = Counter()
+    num_data_points_in_category = Counter()
     for point in dataset:
         name = point['name']
         classification = point['classification']
-        count_of_data_points_with_outcome[classification] += 1
-        if classification not in count_of_word_by_outcome:
-            count_of_word_by_outcome[classification] = Counter()
+        num_data_points_in_category[classification] += 1
+        if classification not in count_of_word_by_category:
+            count_of_word_by_category[classification] = Counter()
         words = set(tokenize(point['contents']))
         for word in words:
-            count_of_word_by_outcome[classification][word] += 1
-    model_dict['count_of_word_by_outcome'] = count_of_word_by_outcome
-    model_dict['num_data_points'] = num_data_points
-    model_dict['count_of_data_points_with_outcome'] = count_of_data_points_with_outcome
-    return model_dict
+            count_of_word_by_category[classification][word] += 1

 """
 TODO - Implement the following functions.
-In each of these functions, you can use the MODEL variable which is a dictionary which has three members:
-MODEL['count_of_word_by_outcome'][outcome][word] = Total number of documents in the category 'outcome' in which this word appears
-MODEL['num_data_points'] = Total number of documents in the data set
-MODEL['count_of_data_points_with_outcome'][outcome] = Total number of documents in the category 'outcome'
+After training (which is run before your code), the following 3 global variables are available:
+count_of_word_by_category[category][word] = Total number of documents in the category 'category' in which this word appears
+num_data_points = Total number of documents in the data set
+num_data_points_in_category[category] = Total number of documents in the category 'category'
 """

 @cache
-def pr_outcome(outcome: str):
-    # Pr(outcome)
+def pr_category(category: str):
+    # Pr(category)
     return 0

 @cache
-def pr_word_given_outcome(word: str, outcome: str, num_words_in_document: int):
-    # Pr(word | outcome)
+def pr_word_given_category(word: str, category: str, num_words_in_document: int):
+    # Pr(word | category)
     return 0

-def pr_outcome_given_words(words: List[str], outcome: str):
-    # Pr(outcome | words)
+def pr_category_given_words(words: List[str], category: str):
+    # Pr(category | words)
     return 0

-def predict(outcomes, words):
+def predict(categories, words):
     best = None
     best_likelihood = -inf
-    for outcome in outcomes:
-        pr = pr_outcome_given_words(words, outcome)
+    for category in categories:
+        pr = pr_category_given_words(words, category)
         if pr > best_likelihood:
-            best = outcome
+            best = category
             best_likelihood = pr
     return best
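The probability functions above are deliberately left as stubs for the assignment. Purely as a sketch of how the three globals described in the docstring could be used (not the intended solution from this commit), the following fills them in with add-one smoothing and log-space scores. The smoothing constant, the log-space scoring, the example counts, and leaving num_words_in_document unused are all assumptions introduced here.

from collections import Counter
from functools import cache
from math import log
from typing import List

# Hypothetical stand-ins for the globals that train() populates.
count_of_word_by_category = {'spam': Counter({'free': 3, 'win': 2}),
                             'ham': Counter({'meeting': 4, 'free': 1})}
num_data_points = 10
num_data_points_in_category = Counter({'spam': 4, 'ham': 6})

@cache
def pr_category(category: str):
    # Pr(category): fraction of training documents in this category.
    return num_data_points_in_category[category] / num_data_points

@cache
def pr_word_given_category(word: str, category: str, num_words_in_document: int):
    # Pr(word | category) with add-one smoothing so unseen words do not
    # zero out the score; num_words_in_document is unused in this sketch.
    return ((count_of_word_by_category[category][word] + 1)
            / (num_data_points_in_category[category] + 2))

def pr_category_given_words(words: List[str], category: str):
    # log(Pr(category) * prod Pr(word | category)), i.e. Pr(category | words)
    # up to the normalizing constant shared by all categories.
    return log(pr_category(category)) + sum(
        log(pr_word_given_category(w, category, len(words))) for w in words)

Returning a log-score rather than a true probability still works with predict as written, since predict only compares scores and starts from -inf.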
main.py

@@ -8,25 +8,25 @@ VALIDATE = 'data/validate.json'

 train = json.loads(open(TRAIN).read())
 validate = json.loads(open(VALIDATE).read())

-def test(dataset, outcomes):
+def test(dataset, categories):
     answers = dict([x.split(" ") for x in open(dataset + "_validate.txt").read().split("\n")[:-1]])
-    bayes.model_dict = bayes.train(train[dataset])
-    correct_by_outcome = Counter()
-    incorrect_by_outcome = Counter()
+    bayes.train(train[dataset])
+    correct_by_category = Counter()
+    incorrect_by_category = Counter()
     for point in validate[dataset]:
         words = set(bayes.tokenize(point['contents']))
-        prediction = bayes.predict(outcomes, words)
+        prediction = bayes.predict(categories, words)
         answer = answers[point['name']]
         if prediction == answer:
-            correct_by_outcome[answer] += 1
+            correct_by_category[answer] += 1
         else:
-            incorrect_by_outcome[answer] += 1
-    print(correct_by_outcome)
-    print(incorrect_by_outcome)
+            incorrect_by_category[answer] += 1
+    print(correct_by_category)
+    print(incorrect_by_category)

 test('tweets', set(['positive', 'negative']))
 test('emails', set(['spam', 'ham']))
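With model_dict unpacked into module-level globals, a caller no longer keeps a returned model around: after bayes.train(...), the counts are read straight off the bayes module, which is what the updated test() relies on. A minimal illustration, with a made-up one-document dataset matching the keys train() reads:

import bayes

# Hypothetical single-document training set.
dataset = [{'name': 'doc1', 'classification': 'spam', 'contents': 'win free money'}]
bayes.train(dataset)

print(bayes.num_data_points)                            # 1
print(bayes.num_data_points_in_category['spam'])        # 1
print(bayes.count_of_word_by_category['spam']['free'])  # 1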