Revert "Rename outcomes to categories; unpack model_dict global"

This reverts commit eb1b7980.

Revert "Rename outcomes to categories; unpack model_dict global"
This reverts commit eb1b7980.
fa240df0 · Yaksher · eb1b7980 · fa240df0 · fa240df0
Commit fa240df0 authored 1 year ago by Yaksher
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 34 additions and 37 deletions
+34 -37
--- a/bayes.py
+++ b/bayes.py
@@ -4,56 +4,57 @@ from math import log, inf
 from functools import cache
 from typing import List

+"""After training, model_dict is a global variable which is accessible inside this module"""
+
 @cache
 def tokenize(text):
    return [y for y in [re.sub('[^a-z0-9]', '', x) for x in text.lower().split(" ")]  if len(y)]

 def train(dataset):
-    count_of_word_by_category = {}
+    model_dict = {}
+    count_of_word_by_outcome = {}
    num_data_points = len(dataset)
-    count_of_data_points_in_category = Counter()
+    count_of_data_points_with_outcome = Counter()
    for point in dataset:
        name = point['name']
        classification = point['classification']
-        count_of_data_points_in_category[classification] += 1
-        if classification not in count_of_word_by_category:
-            count_of_word_by_category[classification] = Counter()
+        count_of_data_points_with_outcome[classification] += 1
+        if classification not in count_of_word_by_outcome:
+            count_of_word_by_outcome[classification] = Counter()
        words = set(tokenize(point['contents']))
        for word in words:
-            count_of_word_by_category[classification][word] += 1
+            count_of_word_by_outcome[classification][word] += 1
+
+    model_dict['count_of_word_by_outcome'] = count_of_word_by_outcome
+    model_dict['num_data_points'] = num_data_points
+    model_dict['count_of_data_points_with_outcome'] = count_of_data_points_with_outcome

-    return (count_of_word_by_category, num_data_points, count_of_data_points_in_category)
+    return model_dict

 """
 TODO - Implement the following functions.
-
-After training (which will be run before your code), the following global 
-variables are available:
-    count_of_word_by_category[category][word] = Total number of documents in the 
-        category 'category' in which this word appears
-
-    num_data_points = Total number of documents in the data set
-
-    count_of_data_points_in_category[category] = Total number of documents in 
-        the category 'category'
+In each of these functions, you can use the module-level model_dict variable, a dictionary with three entries:
+    model_dict['count_of_word_by_outcome'][outcome][word] = Total number of documents in the category 'outcome' in which this word appears
+    model_dict['num_data_points'] = Total number of documents in the data set
+    model_dict['count_of_data_points_with_outcome'][outcome] = Total number of documents in the category 'outcome'
 """
 @cache
-def pr_category(category : str) : # Pr(category)
+def pr_outcome(outcome : str) : # Pr(outcome)
    return 0

 @cache
-def pr_word_given_category(word : str, category : str, num_words_in_document : int): # Pr(word | category)
+def pr_word_given_outcome(word : str, outcome : str, num_words_in_document : int): # Pr(word | outcome)
    return 0

-def pr_category_given_words(words : List[str], category : str): # Pr(category | words)
+def pr_outcome_given_words(words : List[str], outcome : str): # Pr(outcome | words)
    return 0

-def predict(categories, words):
+def predict(outcomes, words):
    best = None
    best_likelihood = -inf
-    for category in categories:
-        pr = pr_category_given_words(words, category)
+    for outcome in outcomes:
+        pr = pr_outcome_given_words(words, outcome)
        if  pr > best_likelihood:
-            best = category
+            best = outcome
            best_likelihood = pr
    return best
--- a/main.py
+++ b/main.py
@@ -8,29 +8,25 @@ VALIDATE = 'data/validate.json'
 train = json.loads(open(TRAIN).read())
 validate = json.loads(open(VALIDATE).read())

-def test(dataset, categories):
+def test(dataset, outcomes):
    answers = dict([x.split(" ") for x in open(dataset + "_validate.txt").read().split("\n")[:-1]])

-    (
-        bayes.count_of_word_by_category,
-        bayes.num_data_points,
-        bayes.count_of_data_points_in_category
-    ) = bayes.train(train[dataset])
+    bayes.model_dict = bayes.train(train[dataset])

-    correct_by_category = Counter()
-    incorrect_by_category = Counter()
+    correct_by_outcome = Counter()
+    incorrect_by_outcome = Counter()

    for point in validate[dataset]:
        words = set(bayes.tokenize(point['contents']))
-        prediction = bayes.predict(categories, words)
+        prediction = bayes.predict(outcomes, words)
        answer = answers[point['name']]
        if prediction == answer:
-            correct_by_category[answer] += 1
+            correct_by_outcome[answer] += 1
        else:
-            incorrect_by_category[answer] += 1
+            incorrect_by_outcome[answer] += 1

-    print(correct_by_category)
-    print(incorrect_by_category)
+    print(correct_by_outcome)
+    print(incorrect_by_outcome)

 test('tweets', set(['positive', 'negative']))
 test('emails', set(['spam', 'ham']))